plugin.video.torrenter/resources/scrapers/kinopoisk/pageparser.py

177 lines
7.1 KiB
Python
Raw Normal View History

2015-01-09 14:11:21 +03:00
# -*- coding: utf-8 -*-
#
# Russian metadata plugin for Plex, which uses http://www.kinopoisk.ru/ to get the tag data.
# Плагин для обновления информации о фильмах использующий КиноПоиск (http://www.kinopoisk.ru/).
# Copyright (C) 2013 Yevgeny Nyden
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301, USA.
#
# @author zhenya (Yevgeny Nyden)
# @version 1.52
# @revision 148
import re
import urllib
import operator
import common
import pluginsettings as S
import translit
2015-06-23 23:00:27 +03:00
2015-12-15 17:18:06 +03:00
2015-01-09 14:11:21 +03:00
# MATCHER_MOVIE_DURATION = re.compile('\s*(\d+).*?', re.UNICODE | re.DOTALL)
2015-06-23 23:00:27 +03:00
# MATCHER_IMDB_RATING = re.compile('IMDb:\s*(\d+\.?\d*)\s*\(\s*([\s\d]+)\s*\)', re.UNICODE | re.DOTALL)
# MATCHER_IMDB_RATING = re.compile('IMDb:\s*(\d+\.?\d*)\s?\((.*)\)', re.UNICODE)
2015-01-09 14:11:21 +03:00
# User-Agent string passed to common.HttpUtils when PageParser builds its
# default HTTP helper.
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.22 (KHTML, like Gecko) Chrome/25.0.1364.172 Safari/537.22'
# NOTE: everything between the triple quotes below is a bare string expression,
# not executed code, so none of the thumbnail sizes, regex matchers or the
# RU_MONTH table (Russian genitive month names -> two-digit month numbers)
# are actually defined at runtime.
'''
MOVIE_THUMBNAIL_SMALL_WIDTH = 130
MOVIE_THUMBNAIL_SMALL_HEIGHT = 168
MOVIE_THUMBNAIL_BIG_WIDTH = 780
MOVIE_THUMBNAIL_BIG_HEIGHT = 1024
# Compiled regex matchers.
MATCHER_WIDTH_FROM_STYLE = re.compile('.*width\s*:\s*(\d+)px.*', re.UNICODE)
MATCHER_HEIGHT_FROM_STYLE = re.compile('.*height\s*:\s*(\d+)px.*', re.UNICODE)
MATCHER_LEADING_NONALPHA = re.compile('^[\s\d\.\(\)]*', re.UNICODE | re.MULTILINE)
# Русские месяца, пригодятся для определения дат.
RU_MONTH = {
u'января': '01',
u'февраля': '02',
u'марта': '03',
u'апреля': '04',
u'мая': '05',
u'июня': '06',
u'июля': '07',
u'августа': '08',
u'сентября': '09',
u'октября': '10',
u'ноября': '11',
u'декабря': '12'
}
'''
class PageParser:
    """ Queries the KinoPoisk mobile search page and scores the returned titles.

        Every search result is a list: [kinoPoiskId, title, year, score].
    """

    # One search hit on the result page: movie id, "title, year" text and
    # the alternative title.
    _MATCHER_SEARCH_ITEM = re.compile(
        r'<span><a href="http://m\.kinopoisk\.ru/movie/(\d+)/">(.+?)</a><br />(.+?)</span>')
    # Splits "Some title, 1999" into the title and the four-digit year.
    _MATCHER_TITLE_YEAR = re.compile(r'^(.+?), (\d\d\d\d)$')

    def __init__(self, logger, httpUtils=None, isDebug=False):
        """ Creates a parser.
            @param logger Object exposing Info/Debug/Warn logging methods.
            @param httpUtils HTTP helper exposing requestAndParseHtmlPage(url);
                   when omitted, a common.HttpUtils is created for this instance.
            @param isDebug When True, the final search results are logged.
        """
        self.log = logger
        self.isDebug = isDebug
        # Build the default helper here rather than in the signature: a default
        # argument is evaluated once at definition time, which would run at
        # import and share a single helper between all parsers.
        if httpUtils is None:
            httpUtils = common.HttpUtils(S.ENCODING_KINOPOISK_PAGE, USER_AGENT)
        self.httpUtils = httpUtils

    def fetchAndParseSearchResults(self, mediaName, mediaYear, mediaAltName=None):
        """ Searches for movie titles on KinoPoisk.
            @param mediaName Movie title parsed from a filename.
            @param mediaYear Movie year parsed from a filename.
            @param mediaAltName Optional alternative title to query as well.
            @return List of results: [kinoPoiskId, title, year, score],
                    ordered from the highest score to the lowest.
        """
        self.log.Info('Quering kinopoisk...')
        results = self.queryKinoPoisk(mediaName, mediaYear)

        # Check media name is all ASCII characters, and if it is,
        # issue another query to KinoPoisk using a translified media name;
        # lastly, merge the scored results.
        if common.isAsciiString(mediaName):
            translifiedMediaName = translit.detranslify(mediaName)
            moreResults = self.queryKinoPoisk(translifiedMediaName, mediaYear)
            results = self._mergeScoredResults(results, moreResults)

        if mediaAltName:
            moreResults = self.queryKinoPoisk(mediaAltName, mediaYear)
            results = self._mergeScoredResults(results, moreResults)

        # Sort all results based on their score (best first). Sorting and then
        # reversing is kept on purpose: it fixes the relative order of
        # equally-scored results the same way it always was.
        results.sort(key=operator.itemgetter(3))
        results.reverse()

        if self.isDebug:
            self.log.Debug('Search produced %d results:' % len(results))
            for index, result in enumerate(results):
                self.log.Debug(' ... %d: id="%s", name="%s", year="%s", score="%d".' %
                               (index, result[0], result[1], str(result[2]), result[3]))
        return results

    @staticmethod
    def _mergeScoredResults(results, moreResults):
        """ Merges two result lists, keeping one entry per KinoPoisk id.
            When both lists contain the same id, the entry with the higher score
            wins (the entry from moreResults wins ties). Entries follow the
            order of moreResults, followed by entries found only in results.
            @return A new list; neither input list is modified.
        """
        remaining = dict((result[0], result) for result in results)
        merged = []
        for result in moreResults:
            origResult = remaining.pop(result[0], None)
            if origResult is not None and origResult[3] > result[3]:
                merged.append(origResult)
            else:
                merged.append(result)
        merged.extend(remaining.values())
        return merged

    def queryKinoPoisk(self, mediaName, mediaYear):
        """ Searches for a movie on KinoPoisk.
            Returns title results as they are returned (no sorting is done here!).
            @param mediaName Title to search for.
            @param mediaYear Year used when scoring the found titles.
            @return List of results: [kinoPoiskId, title, year, score];
                    empty when no page could be fetched.
        """
        results = []
        try:
            encodedName = urllib.quote(mediaName.encode(S.ENCODING_KINOPOISK_PAGE))
        except (UnicodeError, LookupError):
            # The name cannot be represented in the page encoding (or the codec
            # is unavailable); fall back to UTF-8.
            encodedName = urllib.quote(mediaName.encode('utf-8'))

        page = self.httpUtils.requestAndParseHtmlPage(S.KINOPOISK_SEARCH_SIMPLE % encodedName)
        if page is None:
            self.log.Warn(' ### nothing was found on kinopoisk for media name "%s"' % mediaName)
            return results

        # The page has been fetched; extract the list of all movie titles from it.
        self.log.Debug('**** Finding %s (%s) got a KinoPoisk...' % (mediaName, mediaYear))
        foundItems = self._MATCHER_SEARCH_ITEM.findall(page)

        # Inspect query results titles and score them.
        self.log.Debug('found %d results (div info tags)' % len(foundItems))
        for itemIndex, (itemKinoPoiskId, itemTitleitemYear, itemAltTitle) in enumerate(foundItems):
            itemAltTitle = itemAltTitle.replace('&nbsp;', '')
            titleYearMatch = self._MATCHER_TITLE_YEAR.match(itemTitleitemYear.strip())
            if titleYearMatch:
                itemTitle, itemYear = titleYearMatch.groups()
            else:
                # No trailing ", YYYY": use the whole text as the title.
                itemYear = None
                itemTitle = itemTitleitemYear
            itemScore = common.scoreMediaTitleMatch(
                mediaName, mediaYear, itemTitle, itemAltTitle, itemYear, itemIndex)
            results.append([itemKinoPoiskId, itemTitle, itemYear, itemScore])
        return results