抓取猫眼电影 Code

2018-09-01  本文已影响0人  其徐如林
import json, requests, re
from datetime import time

def get_one_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    A desktop Chrome ``User-Agent`` header is sent with the request.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the script forever.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Best-effort fetch: report the failure and let the caller skip the
        # page. Only request errors are caught so that KeyboardInterrupt and
        # SystemExit still propagate.
        print(exc)
        return None
    if response.status_code == 200:
        return response.text
    return None

def _strip_label(text, label):
    """Return *text* trimmed of whitespace and of a leading *label* plus colon."""
    text = text.strip()
    if text.startswith(label):
        # Drop the label, then the full-width or ASCII colon that follows it.
        return text[len(label):].lstrip('：:').strip()
    return text


def parse_one_page(html):
    """Yield one dict per movie entry found in a board page.

    Args:
        html: Page source as a string. ``None`` or an empty string yields
            nothing instead of raising inside the regex engine.

    Yields:
        Dicts with the keys ``index``, ``image``, ``title``, ``actor``,
        ``time`` and ``score``; every value is a string.
    """
    if not html:
        return

    pattern = re.compile(
        '<dd>.*?board-index.*?>(.*?)</i>'  # rank inside the board-index <i>
        '.*?data-src="(.*?)"'  # poster image URL
        '.*?<a.*?data-val=".*?">(.*?)</a>'  # movie title
        '.*?class="star">(.*?)</p>'  # leading actors
        '.*?class="releasetime">(.*?)</p>'  # release date
        # score: integer part and fraction part live in two <i> tags
        '.*?class="integer">(.*?)</i><i.*?class=".*?">(.*?)</i>',
        re.S,  # entries span several lines, so '.' must match newlines
    )
    for rank, image, title, actors, release_time, whole, fraction in pattern.findall(html):
        yield {
            'index': rank,
            'image': image,
            'title': title,
            # NOTE(review): the label texts are presumed to be what the page
            # renders in front of the values - confirm against the live markup.
            # Stripping by prefix (rather than a fixed-width slice) keeps the
            # full-width colon from leaking into 'time' and leaves unlabeled
            # values untouched.
            'actor': _strip_label(actors, '主演'),
            'time': _strip_label(release_time, '上映时间'),
            'score': whole.strip() + fraction.strip()
        }

def write_json(data, path='movie.json'):
    """Append *data* to a file as one JSON document followed by a newline.

    Args:
        data: JSON-serializable object to store.
        path: Destination file; opened in append mode with UTF-8 encoding.
            Defaults to ``movie.json`` in the current working directory.
    """
    with open(path, 'a', encoding='utf-8') as out:
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese titles)
        # readable in the file instead of emitting \uXXXX escapes; the file
        # is UTF-8, so the characters can be written directly.
        json.dump(data, out, ensure_ascii=False)
        out.write('\n')

def main(offset):
    """Download one board page and append every parsed movie to the JSON file.

    Args:
        offset: Value placed in the ``offset`` query parameter of the board
            URL.
    """
    url = 'Http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page gives None for a failed or non-200 request; handing
        # None to the regex parser would raise TypeError, so skip this page.
        return
    for movie in parse_one_page(html):
        write_json(movie)
if __name__ == '__main__':
    # The file-level `from datetime import time` binds the datetime.time
    # class, which has no sleep(); rebind the name to the stdlib time module
    # so the delay below actually works.
    import time

    # Ten pages, stepping the offset by 10 each time.
    for page in range(10):
        main(offset=page * 10)
        time.sleep(2)  # pause between consecutive requests

上一篇下一篇

猜你喜欢

热点阅读