Python爬虫

Python抓取豆瓣电影一周口碑榜

2019-08-29  本文已影响0人  龙小江i

作为练手项目,没想过要精简代码,重要的是在爬取的过程中找到思路。

import requests
import re
import pandas as pd
import openpyxl
import xlwings
import time
from bs4 import BeautifulSoup

# Fetch the Douban movie home page and collect the title cells of the
# weekly word-of-mouth billboard into `title`.
print("正在爬取数据,请稍等...")
# Send a desktop-browser User-Agent with the request.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36"}
url = "https://movie.douban.com/"
# Without a timeout, requests can block indefinitely on a stalled connection.
get = requests.get(url, headers=headers, timeout=10)
# Stop on 4xx/5xx responses instead of parsing an error page.
get.raise_for_status()
text = BeautifulSoup(get.text, "lxml")
# find_all is the current name of the legacy findAll alias.
mod = text.find_all("div", class_="billboard-bd")
if not mod:
    # Give a clear diagnosis instead of a bare IndexError on mod[0].
    raise RuntimeError(
        "No <div class='billboard-bd'> found on {}; the page layout may have "
        "changed or the request was blocked.".format(url)
    )
title = mod[0].find_all("td", class_="title")

# Build the {movie name: detail-page URL} dictionary from the billboard cells.
#
# Both patterns are compiled once, outside the loop, and written as raw
# strings: the previous "https\://" literal relied on an invalid "\:" escape
# sequence, which newer Python versions warn about.

# Key: the text between the opening <a ...> tag and its closing </a>.
# The five characters skipped after "mv_rk" replace the old [10:-4] slice;
# presumably they are the tail of the onclick attribute (`'})">`) -- verify
# against the live markup if the page layout changes.
pattern_name = re.compile(r"mv_rk.{5}(.*)</a>")
# Value: the movie's subject URL (dots escaped so they match literally).
pattern_url = re.compile(r"https://movie\.douban\.com/subject/[0-9]+/")

movie = {}
for i in title:
    string = str(i)

    match_name = pattern_name.search(string)
    match_url = pattern_url.search(string)

    # Skip cells that do not carry both a name and a URL, rather than
    # inserting an empty "" key or an empty URL into the dictionary.
    if match_name is None or match_url is None:
        continue

    string_name = match_name.group(1)
    string_url = match_url.group(0)

    # Add the entry.
    movie[string_name] = string_url

# Save the name -> URL mapping as an Excel workbook.
# The dict is loaded as a single-row frame and then transposed, so every
# movie name becomes a row label with its URL in the only column.
filepath = r"C:/Users/longxiaojiangi/Documents/JupyterNotebook/AutoSaveFiles/豆瓣电影一周口碑榜.xlsx"
single_row = pd.DataFrame([movie])
df = single_row.transpose()
df.to_excel(filepath)

# Reopen the saved workbook in a hidden Excel instance to label the two
# columns and record when the data was last refreshed.
app = xlwings.App(visible=False, add_book=False)
try:
    wb = app.books.open(filepath)
    sheet1 = wb.sheets[0]
    # Replace the header cells of the first row with readable column titles.
    sheet1.range("A1").value = "电影"
    sheet1.range("B1").value = "URL"
    # The timestamp goes in a fixed row (14).
    # NOTE(review): assumes the movie rows end above row 14 -- confirm if the
    # billboard ever grows.
    sheet1.range("A14").value = "最后一次更新时间"
    sheet1.range("B14").value = time.strftime("%Y-%m-%d %H:%M:%S",time.localtime())
    wb.save()
    wb.close()
finally:
    # Always shut down the hidden Excel instance; otherwise it stays running
    # in the background, including when any step above raises.
    app.quit()
print("数据已保存至:\n{}".format(filepath))
上一篇下一篇

猜你喜欢

热点阅读