Python批量爬取百度贴吧数据

2019-10-03  本文已影响0人  黑猫编程

分析百度贴吧url

kw=python作为查询参数传入,pn=0为第一页,第二页pn=50,第三页pn=100

先构造出前10页url

# -*- coding: utf-8 -*-
# @Time    : 2019/10/3 18:56
# @Author  : 币行者
# @Email   : xypip@qq.com
# @File    : test5.py

import requests

# Browser-style User-Agent so Tieba serves the normal desktop page.
headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

url_temp = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
# Each listing page advances pn by 50 posts; build URLs for the first 10 pages.
url_list = []
for page_index in range(10):
    url_list.append(url_temp.format("python", page_index * 50))

print(url_list)

将获取到10页数据全部保存至本地

# -*- coding: utf-8 -*-
# @Time    : 2019/10/3 18:56
# @Author  : 币行者
# @Email   : xypip@qq.com
# @File    : test5.py

import requests

# Browser-style User-Agent so Tieba serves the normal desktop page.
headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"}

url_temp = "https://tieba.baidu.com/f?kw={}&ie=utf-8&pn={}"
# Each listing page advances pn by 50 posts; build URLs for the first 10 pages.
url_list = [url_temp.format("python", i * 50) for i in range(10)]

# print(url_list)

# Fetch each listing page and save its HTML locally.
# enumerate(..., start=1) yields the 1-based page number directly; the
# original url_list.index(url) re-scanned the list on every iteration
# (O(n) per page) and would mis-number any duplicate URL.
for page_num, url in enumerate(url_list, start=1):
    response = requests.get(url, headers=headers)
    html_str = response.content.decode()
    file_path = "{}—第{}页.html".format("python", page_num)
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(html_str)

面向对象方法

# -*- coding: utf-8 -*-
# @Time    : 2019/10/3 18:37
# @Author  : 币行者
# @Email   : xypip@qq.com
# @File    : baidutieba_spider.py

import requests

class TiebaSpider:
    """Crawl the first ``tieba_num`` listing pages of a Baidu Tieba forum
    and save each page's HTML to a local file."""

    def __init__(self, tieba_name, tieba_num):
        # tieba_name: forum keyword (the kw= query parameter)
        # tieba_num: number of listing pages to fetch
        self.tieba_name = tieba_name
        self.tieba_num = tieba_num
        # Each listing page advances pn by 50 posts.
        self.url_temp = "https://tieba.baidu.com/f?kw=" + tieba_name + "&ie=utf-8&pn={}"
        # Browser-style User-Agent so Tieba serves the normal desktop page.
        self.headers = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"}

    def get_url_list(self):
        """Return the listing-page URLs for pages 1..tieba_num."""
        return [self.url_temp.format(i * 50) for i in range(self.tieba_num)]

    def parse_url(self, url):
        """Fetch ``url`` and return the decoded response body as a str."""
        print(url)
        response = requests.get(url, headers=self.headers)
        return response.content.decode()

    def save_html(self, html_str, page_num):
        """Write one page's HTML to '<name>—第<page_num>页.html'."""
        file_path = "{}—第{}页.html".format(self.tieba_name, page_num)
        with open(file_path, "w", encoding="utf-8") as f:
            f.write(html_str)

    def run(self):
        """Fetch every listing page and save each one to disk."""
        # enumerate(..., start=1) yields the 1-based page number directly;
        # the original url_list.index(url) re-scanned the list on every
        # iteration (O(n) per page) and would mis-number duplicate URLs.
        for page_num, url in enumerate(self.get_url_list(), start=1):
            html_str = self.parse_url(url)
            self.save_html(html_str, page_num)


if __name__ == '__main__':
    # Crawl the first 10 listing pages of the "python" forum.
    spider = TiebaSpider("python", 10)
    spider.run()
上一篇下一篇

猜你喜欢

热点阅读