Scraping Baidu Tieba Data in Batches with Python
2019-10-03
黑猫编程
Analyzing the Baidu Tieba URL
In the URL, kw=python is the keyword query parameter (requests can also take it as a dict; see the sketch after the first snippet), and pn controls pagination: pn=0 is page 1, pn=50 page 2, pn=100 page 3, i.e. pn = (page - 1) * 50.
First, construct the URLs for the first 10 pages.
# -*- coding: utf-8 -*-
# @Time : 2019/10/3 18:56
# @Author : 币行者
# @Email : xypip@qq.com
# @File : test5.py
import requests

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
# Template URL: the first placeholder takes the tieba name, the second the pn offset
url_temp = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
# pn advances by 50 per page, so page i+1 uses pn = i * 50
url_list = [url_temp.format("python", i * 50) for i in range(10)]
print(url_list)
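As an alternative to formatting the URL by hand, requests can encode the query string itself when the parameters are passed as a dict via its params argument. A minimal sketch of that approach:

import requests

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

# requests URL-encodes the dict into ?kw=python&ie=utf-8&pn=0 for us
response = requests.get(
    "https://tieba.baidu.com/f",
    params={"kw": "python", "ie": "utf-8", "pn": 0},
    headers=headers,
)
print(response.url)  # the fully encoded URL that was actually requested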
Fetch all 10 pages and save them to local files.
# -*- coding: utf-8 -*-
# @Time : 2019/10/3 18:56
# @Author : 币行者
# @Email : xypip@qq.com
# @File : test5.py
import requests

headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}
url_temp = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
url_list = [url_temp.format("python", i * 50) for i in range(10)]
# print(url_list)

for url in url_list:
    response = requests.get(url, headers=headers)
    html_str = response.content.decode()  # decode the raw bytes (UTF-8 by default)
    page_num = url_list.index(url) + 1  # position in the list is the page number
    file_path = "{}—第{}页.html".format("python", page_num)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html_str)
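A small aside on the loop: url_list.index(url) rescans the list on every iteration and would return the wrong page number if two URLs ever happened to be identical. enumerate yields the page number directly; an equivalent loop:

for page_num, url in enumerate(url_list, start=1):
    response = requests.get(url, headers=headers)
    html_str = response.content.decode()
    file_path = "{}—第{}页.html".format("python", page_num)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html_str)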
Object-oriented approach
# -*- coding: utf-8 -*-
# @Time : 2019/10/3 18:37
# @Author : 币行者
# @Email : xypip@qq.com
# @File : baidutieba_spider.py
import requests


class TiebaSpider:
    def __init__(self, tieba_name, tieba_num):
        self.tieba_name = tieba_name  # name of the tieba to crawl
        self.tieba_num = tieba_num  # number of pages to fetch
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"}

    def get_url_list(self):
        # pn advances by 50 per page
        return [self.url_temp.format(i * 50) for i in range(self.tieba_num)]

    def parse_url(self, url):
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def save_html(self, html_str, page_num):
        file_path = "{}—第{}页.html".format(self.tieba_name, page_num)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    def run(self):
        url_list = self.get_url_list()
        for url in url_list:
            html_str = self.parse_url(url)
            page_num = url_list.index(url) + 1  # page number
            self.save_html(html_str, page_num)


if __name__ == '__main__':
    tieba_spider = TiebaSpider("python", 10)
    tieba_spider.run()
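The spider above assumes every request succeeds and fires requests back to back. A possible hardened parse_url, shown only as a sketch (the timeout and the one-second pause are assumptions added here, not part of the original code):

import time

import requests

# Drop-in replacement for TiebaSpider.parse_url with basic robustness added
def parse_url(self, url):
    print(url)
    response = requests.get(url, headers=self.headers, timeout=10)
    response.raise_for_status()  # raise on 4xx/5xx instead of saving an error page
    time.sleep(1)  # brief pause between pages keeps the crawl polite
    return response.content.decode()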