# -*- coding: utf-8 -*- # # Russian metadata plugin for Plex, which uses http://www.kinopoisk.ru/ to get the tag data. # Плагин для обновления информации о фильмах использующий КиноПоиск (http://www.kinopoisk.ru/). # Copyright (C) 2013 Yevgeny Nyden # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA # 02110-1301, USA. # # @author zhenya (Yevgeny Nyden) # @version 1.52 # @revision 148 import re import urllib import operator import common import pluginsettings as S import translit # MATCHER_MOVIE_DURATION = re.compile('\s*(\d+).*?', re.UNICODE | re.DOTALL) # MATCHER_IMDB_RATING = re.compile('IMDb:\s*(\d+\.?\d*)\s*\(\s*([\s\d]+)\s*\)', re.UNICODE | re.DOTALL) # MATCHER_IMDB_RATING = re.compile('IMDb:\s*(\d+\.?\d*)\s?\((.*)\)', re.UNICODE) USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22' ''' MOVIE_THUMBNAIL_SMALL_WIDTH = 130 MOVIE_THUMBNAIL_SMALL_HEIGHT = 168 MOVIE_THUMBNAIL_BIG_WIDTH = 780 MOVIE_THUMBNAIL_BIG_HEIGHT = 1024 # Compiled regex matchers. MATCHER_WIDTH_FROM_STYLE = re.compile('.*width\s*:\s*(\d+)px.*', re.UNICODE) MATCHER_HEIGHT_FROM_STYLE = re.compile('.*height\s*:\s*(\d+)px.*', re.UNICODE) MATCHER_LEADING_NONALPHA = re.compile('^[\s\d\.\(\)]*', re.UNICODE | re.MULTILINE) # Русские месяца, пригодятся для определения дат. RU_MONTH = { u'января': '01', u'февраля': '02', u'марта': '03', u'апреля': '04', u'мая': '05', u'июня': '06', u'июля': '07', u'августа': '08', u'сентября': '09', u'октября': '10', u'ноября': '11', u'декабря': '12' } ''' class PageParser: def __init__(self, logger, httpUtils=common.HttpUtils(S.ENCODING_KINOPOISK_PAGE, USER_AGENT), isDebug=False): self.log = logger self.isDebug = isDebug self.httpUtils = httpUtils def fetchAndParseSearchResults(self, mediaName, mediaYear, mediaAltName=None): """ Searches for movie titles on KinoPoisk. @param mediaName Movie title parsed from a filename. @param mediaName Movie year parsed from a filename. @return Array of tuples: [kinoPoiskId, title, year, score] """ self.log.Info('Quering kinopoisk...') results = self.queryKinoPoisk(mediaName, mediaYear) # Check media name is all ASCII characters, and if it is, # issue another query to KinoPoisk using a translified media name; # lastly, merge the scored results. if common.isAsciiString(mediaName): translifiedMediaName = translit.detranslify(mediaName) moreResults = self.queryKinoPoisk(translifiedMediaName, mediaYear) resultsMap = dict() for result in results: resultsMap[result[0]] = result results = [] # Recreate and repopulate the results array removing duplicates. for result in moreResults: currId = result[0] if currId in resultsMap.keys(): origResult = resultsMap[currId] del resultsMap[currId] if result[3] >= origResult[3]: results.append(result) else: results.append(origResult) else: results.append(result) results.extend(resultsMap.viewvalues()) if mediaAltName: moreResults = self.queryKinoPoisk(mediaAltName, mediaYear) resultsMap = dict() for result in results: resultsMap[result[0]] = result results = [] # Recreate and repopulate the results array removing duplicates. for result in moreResults: currId = result[0] if currId in resultsMap.keys(): origResult = resultsMap[currId] del resultsMap[currId] if result[3] >= origResult[3]: results.append(result) else: results.append(origResult) else: results.append(result) results.extend(resultsMap.viewvalues()) # Sort all results based on their score. results.sort(key=operator.itemgetter(3)) results.reverse() if self.isDebug: self.log.Debug('Search produced %d results:' % len(results)) index = -1 for result in results: index += 1 self.log.Debug(' ... %d: id="%s", name="%s", year="%s", score="%d".' % (index, result[0], result[1], str(result[2]), result[3])) return results def queryKinoPoisk(self, mediaName, mediaYear): """ Ищет фильм на кинопоиске. Returns title results as they are returned (no sorting is done here!). """ results = [] try: encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE)) except: encodedName = urllib.quote(mediaName.encode('utf-8')) page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName) if page is None: self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName) return results # Страница получена, берем с нее перечень всех названий фильмов. self.log.Debug('**** Finding %s (%s) got a KinoPoisk...' % (mediaName, mediaYear)) reobj = re.compile(r'(.+?)
(.+?)
') result = reobj.findall(page) # Inspect query results titles and score them. self.log.Debug('found %d results (div info tags)' % len(result)) itemIndex = -1 for itemKinoPoiskId, itemTitleitemYear, itemAltTitle in result: itemIndex = itemIndex + 1 itemAltTitle = itemAltTitle.replace(' ', '') try: itemTitle, itemYear = re.compile('^(.+?), (\d\d\d\d)$').findall(itemTitleitemYear.strip())[0] except: itemYear = None itemTitle = itemTitleitemYear itemScore = common.scoreMediaTitleMatch(mediaName, mediaYear, itemTitle, itemAltTitle, itemYear, itemIndex) results.append([itemKinoPoiskId, itemTitle, itemYear, itemScore]) return results