Python全面解读2018电影票房市场！

2018-11-22 本文已影响8人 919b0c54458f

双11已经过去，双12即将来临，离2018年的结束也就2个月不到，还记得年初立下的flag吗？

完成了多少？相信很多人和我一样，抱头痛哭...

本次利用猫眼电影，实现对2018年的电影大数据进行分析。

/ 01 / 网页分析

01 标签

集群：548377875 即可获取数十套PDF以及大量的学习教程！

通过点击猫眼电影已经归类好的标签，得到网址信息。

02 索引页

打开开发人员工具，获取索引页里电影的链接以及评分信息。

索引页一共有30多页，但是有电影评分的只有10页。

本次只对有电影评分的数据进行获取。

03 详情页

对详情页的信息进行获取。

主要是名称，类型，国家，时长，上映时间，评分，评分人数，累计票房。

/ 02 / 反爬破解

通过开发人员工具发现，猫眼针对评分，评分人数，累计票房的数据，施加了文字反爬。

通过查看网页源码，发现只要刷新页面，三处文字编码就会改变，无法直接匹配信息。

所以需要下载文字文件，对其进行双匹配。

from fontTools.ttLib import TTFont

#font = TTFont('base.woff')

#font.saveXML('base.xml')

font = TTFont('maoyan.woff')

font.saveXML('maoyan.xml')

将woff格式转换为xml格式，以便在Pycharm中查看详细信息。

利用下面这个网站，打开woff文件。

url: http://fontstore.baidu.com/static/editor/index.html

可以得到下面数字部分信息(上下两块)。

在Pycharm中查看xml格式文件(左右两块)，你就会发现有对应信息。

通过上图你就可以将数字6对上号了，其他数字一样的。

def get_numbers(u):

"""

对猫眼的文字反爬进行破解

"""

cmp = re.compile(", url('(//.*.woff)') format('woff')")

rst = cmp.findall(u)

ttf = requests.get("http:" + rst[0], stream=True)

with open("maoyan.woff", "wb") as pdf:

for chunk in ttf.iter_content(chunk_size=1024):

if chunk:

pdf.write(chunk)

base_font = TTFont('base.woff')

maoyanFont = TTFont('maoyan.woff')

maoyan_unicode_list = maoyanFont['cmap'].tables[0].ttFont.getGlyphOrder()

maoyan_num_list = []

base_num_list = ['.', '3', '0', '8', '9', '4', '1', '5', '2', '7', '6']

base_unicode_list = ['x', 'uniF561', 'uniE6E1', 'uniF125', 'uniF83F', 'uniE9E2', 'uniEEA6', 'uniEEC2', 'uniED38', 'uniE538', 'uniF8E7']

for i in range(1, 12):

maoyan_glyph = maoyanFont['glyf'][maoyan_unicode_list[i]]

for j in range(11):

base_glyph = base_font['glyf'][base_unicode_list[j]]

if maoyan_glyph == base_glyph:

maoyan_num_list.append(base_num_list[j])

break

maoyan_unicode_list[1] = 'uni0078'

utf8List = [eval(r"'\u" + uni[3:] + "'").encode("utf-8") for uni in maoyan_unicode_list[1:]]

utf8last = []

for i in range(len(utf8List)):

utf8List[i] = str(utf8List[i], encoding='utf-8')

utf8last.append(utf8List[i])

return (maoyan_num_list ,utf8last)

/ 03 / 数据获取

01 构造请求头

head = """

Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8

Accept-Encoding:gzip, deflate, br

Accept-Language:zh-CN,zh;q=0.8

Cache-Control:max-age=0

Connection:keep-alive

Host:maoyan.com

Upgrade-Insecure-Requests:1

Content-Type:application/x-www-form-urlencoded; charset=UTF-8

User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36

"""

def str_to_dict(header):

"""

构造请求头,可以在不同函数里构造不同的请求头

"""

header_dict = {}

header = header.split('')

for h in header:

h = h.strip()

if h:

k, v = h.split(':', 1)

header_dict[k] = v.strip()

return header_dict

因为索引页和详情页请求头不一样，这里为了简便，构造了一个函数。

02 获取电影详情页链接

def get_url():

"""

获取电影详情页链接

"""

for i in range(0, 300, 30):

time.sleep(10)

url = 'http://maoyan.com/films?showType=3&yearId=13&sortId=3&offset=' + str(i)

host = """Referer:http://maoyan.com/films?showType=3&yearId=13&sortId=3&offset=0

"""

header = head + host

headers = str_to_dict(header)

response = requests.get(url=url, headers=headers)

html = response.text

soup = BeautifulSoup(html, 'html.parser')

data_1 = soup.find_all('div', {'class': 'channel-detail movie-item-title'})

data_2 = soup.find_all('div', {'class': 'channel-detail channel-detail-orange'})

num = 0

for item in data_1:

num += 1

time.sleep(10)

url_1 = item.select('a')[0]['href']

if data_2[num-1].get_text() != '暂无评分':

url = 'http://maoyan.com' + url_1

for message in get_message(url):

print(message)

to_mysql(message)

print(url)

print('---------------^^^Film_Message^^^-----------------')

else:

print('The Work Is Done')

break

03 获取电影详情页信息

def get_message(url):

"""

获取电影详情页里的信息

"""

time.sleep(10)

data = {}

host = """refer: http://maoyan.com/news

"""

header = head + host

headers = str_to_dict(header)

response = requests.get(url=url, headers=headers)

u = response.text

# 破解猫眼文字反爬

(mao_num_list, utf8last) = get_numbers(u)

# 获取电影信息

soup = BeautifulSoup(u, "html.parser")

mw = soup.find_all('span', {'class': 'stonefont'})

score = soup.find_all('span', {'class': 'score-num'})

unit = soup.find_all('span', {'class': 'unit'})

ell = soup.find_all('li', {'class': 'ellipsis'})

name = soup.find_all('h3', {'class': 'name'})

# 返回电影信息

data["name"] = name[0].get_text()

data["type"] = ell[0].get_text()

data["country"] = ell[1].get_text().split('/')[0].strip().replace('', '')

data["length"] = ell[1].get_text().split('/')[1].strip().replace('', '')

data["released"] = ell[2].get_text()[:10]

# 因为会出现没有票房的电影,所以这里需要判断

if unit:

bom = ['分', score[0].get_text().replace('.', '').replace('万', ''), unit[0].get_text()]

for i in range(len(mw)):

moviewish = mw[i].get_text().encode('utf-8')

moviewish = str(moviewish, encoding='utf-8')

# 通过比对获取反爬文字信息

for j in range(len(utf8last)):

moviewish = moviewish.replace(utf8last[j], maoyan_num_list[j])

if i == 0:

data["score"] = moviewish + bom[i]

elif i == 1:

if '万' in moviewish:

data["people"] = int(float(moviewish.replace('万', '')) * 10000)

else:

data["people"] = int(float(moviewish))

else:

if '万' == bom[i]:

data["box_office"] = int(float(moviewish) * 10000)

else:

data["box_office"] = int(float(moviewish) * 100000000)

else:

bom = ['分', score[0].get_text().replace('.', '').replace('万', ''), 0]

for i in range(len(mw)):

moviewish = mw[i].get_text().encode('utf-8')

moviewish = str(moviewish, encoding='utf-8')

for j in range(len(utf8last)):

moviewish = moviewish.replace(utf8last[j], maoyan_num_list[j])

if i == 0:

data["score"] = moviewish + bom[i]

else:

if '万' in moviewish:

data["people"] = int(float(moviewish.replace('万', '')) * 10000)

else:

data["people"] = int(float(moviewish))

data["box_office"] = bom[2]

yield data

/ 04 / 数据存储

01 创建数据库及表格

db = pymysql.connect(host='127.0.0.1', user='root', password='774110919', port=3306)

cursor = db.cursor()

cursor.execute("CREATE DATABASE maoyan DEFAULT CHARACTER SET utf8mb4")

db.close()

db = pymysql.connect(host='127.0.0.1', user='root', password='774110919', port=3306, db='maoyan')

cursor = db.cursor()

sql = 'CREATE TABLE IF NOT EXISTS films (name VARCHAR(255) NOT NULL, type VARCHAR(255) NOT NULL, country VARCHAR(255) NOT NULL, length VARCHAR(255) NOT NULL, released VARCHAR(255) NOT NULL, score VARCHAR(255) NOT NULL, people INT NOT NULL, box_office BIGINT NOT NULL, PRIMARY KEY (name))'

cursor.execute(sql)

db.close()

其中票房收入数据类型为BIGINT(19位数)，最大为18446744073709551615。

INT(10位数)，最大为2147483647，达不到36亿(3600000000)。

02 数据存储

def to_mysql(data):

"""

信息写入mysql

"""

table = 'films'

keys = ', '.join(data.keys())

values = ', '.join(['%s'] * len(data))