Python Crawler 6: Structuring a Crawler Program

2022-07-25  _百草_

1. Program structure

# Program structure
class xxxSpider(object):
    def __init__(self):
        # Define commonly used variables, such as the base URL and counters
        pass

    def get_html(self):
        # Fetch the response content, using a random User-Agent
        pass

    def parse_html(self):
        # Parse the page with regular expressions and extract the data
        pass

    def write_html(self):
        # Save the extracted data as required (CSV, MySQL database, etc.)
        pass

    def run(self):
        # Main entry point that controls the overall flow
        pass

if __name__ == '__main__':
    # program start time (record it here if you want to measure the run time)
    spider = xxxSpider()
    spider.run()
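The skeleton only describes write_html in a comment. As a minimal sketch of the CSV case (the row format, file name, and csv.writer usage below are illustrative assumptions, not part of the original skeleton), it could look like this:

import csv

def write_html(self, rows, filename="result.csv"):
    # Belongs inside the spider class; rows is assumed to be an
    # iterable of tuples such as [(title, link), ...].
    with open(filename, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerows(rows)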

2. Exercise

# -*- coding:utf-8 -*-
"""
@author:百草Lily
@file:test_spider.py
@time:2022/7/25
"""
import os.path
import random
from urllib.request import Request, urlopen
from urllib import parse
import time
from faker import Faker


# Define a spider class
class TiebaSpider:

    # Initialize the url attribute
    def __init__(self):
        self.url = "https://tieba.baidu.com/f?{}"

    # 1. Request function: fetch the page in the usual three steps (build the Request, send it, read and decode)
    def get_html(self, url):
        fake = Faker(locale="zh_CN")
        ua = fake.user_agent()
        req = Request(url, headers={"User-Agent": ua})
        res = urlopen(req)
        html = res.read().decode("utf-8")  # decode as UTF-8 to avoid garbled Chinese text
        return html

    # 2. Parse function
    def parse_html(self):
        pass

    # 3. Save-to-file function
    def save_html(self, filename, html):
        with open(filename, "w", encoding="utf-8") as f:
            f.write(html)
            # Without encoding='utf-8' this fails on Windows (default GBK codec) with:
            # UnicodeEncodeError: 'gbk' codec can't encode character '\xa9' in position 5283: illegal multibyte sequence

    # 4. Entry function
    def run(self):
        name = input("请输入贴吧名:")
        begin = int(input("请输入起始页:"))
        stop = int(input("请输入终止页:"))
        params = {
            "ie": "utf-8",
            "kw": name
        }
        # +1 so that the stop page itself is included
        for page in range(begin, stop + 1):
            # Build the query parameters
            pn = (page - 1) * 50  # starting record offset; 50 posts per page
            if pn == 0:
                params["fr"] = "search"
            else:
                params["pn"] = pn
            # Build the URL
            url = self.url.format(parse.urlencode(params))
            # Send the request
            html = self.get_html(url)
            # Build the output file name and path
            basedir = os.path.dirname(__file__)
            out_dir = os.path.join(basedir, "html")
            os.makedirs(out_dir, exist_ok=True)  # make sure the html/ output directory exists
            filename = os.path.join(out_dir, "{}_{}页.html".format(name, page))
            # Save the file
            self.save_html(filename, html)
            # Log progress
            print(f"第{page}页抓取成功!")
            # Wait: sleep for a random 1-2 seconds after each page
            time.sleep(random.randint(1, 2))  # randint includes both endpoints


if __name__ == "__main__":
    start = time.time()
    spider = TiebaSpider()  # create an instance
    spider.run()  # call the entry function
    end = time.time()
    # report the total execution time
    print(f"执行时间:{end-start}秒")

3. Random sleep

time.sleep(random.randint(1, 2)) — a crawler requests pages far faster than a human clicking through a site, which is an obvious giveaway. Sleeping for a random interval between requests makes the access pattern look more like a human visitor, so the site is less likely to notice the crawler; the cost is that the program runs more slowly.
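random.randint(1, 2) only ever sleeps a whole 1 or 2 seconds. As a small variation (a sketch, not something from the referenced tutorial), random.uniform gives fractional delays and a slightly less regular rhythm:

import random
import time

def polite_sleep(min_s=1.0, max_s=2.0):
    # Sleep for a random, fractional number of seconds in [min_s, max_s].
    time.sleep(random.uniform(min_s, max_s))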

References

Python爬虫抓取百度贴吧数据

Personal practice

# -*- coding:utf-8 -*-
"""
@author:百草Lily
@file:test_tieba.py
@time:2022/7/22
"""
from urllib import parse
import os
import time
from urllib.request import Request, urlopen
from faker import Faker

# https://tieba.baidu.com/f?ie=utf-8&kw=ces&fr=search
# https://tieba.baidu.com/f?kw=ces&ie=utf-8&pn=50
# https://tieba.baidu.com/f?kw=ces&ie=utf-8&pn=100

word = input("请输入希望在贴吧搜索的内容:")
urls = []  # URLs to request

base_url = "https://tieba.baidu.com/f"
for i in range(0, 201, 50):  # first 5 pages: pn = 0, 50, 100, 150, 200
    params = {
        "ie": "utf-8",
        "kw": word
    }
    if i == 0:
        params["fr"] = "search"
    else:
        params["pn"] = i
    urls.append("{}?{}".format(base_url, parse.urlencode(params)))
print(urls)


# <a rel="noopener" href="/p/6689385508" title="开学了" target="_blank" class="j_th_tit ">开学了</a>
# <a rel="noopener" href="/p/7849277087" title="勇士第四节后段换回主力柯尔" target="_blank" class="j_th_tit ">勇士第四节</a>
# Send the request
def get_req(url: str):
    """
    Send a GET request with a random User-Agent.
    :param url: target URL
    :return: the HTTP response object
    """
    fake = Faker(locale="zh_CN")
    ua = fake.user_agent()
    headers = {
        "User-Agent": ua
    }
    req = Request(url, headers=headers)
    resp = urlopen(req)
    # text = resp.read().decode("utf-8")
    # resp_header = resp.info()  # Variable in function should be lowercase
    return resp


# Get the file extension
def get_extension(resp):
    """
    Determine the file extension from the response Content-Type.
    :param resp: the HTTP response object
    :return: the extension string, or None for unsupported types
    """
    # e.g. "Content-Type: text/html"
    content_type = resp.info()["Content-Type"]  # read from the response headers
    if "text/html" in content_type:
        # HTML content
        return ".html"
    else:
        print("不支持的类型")


# Save the response to a file
def save_file(resp):
    """
    Save the response body to a file named after the current timestamp.
    :param resp: the HTTP response object
    :return: the saved file path, or an error message for unsupported types
    """
    ext = get_extension(resp)
    if not ext:
        return "不支持的文件类型"
    filename = os.path.join(os.path.dirname(__file__), f"{time.strftime('%Y%m%d%H%M%S')}{ext}")
    content = resp.read().decode("utf-8")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)
    return filename


filenames = []  # paths of the saved files
for url in urls:
    resp = get_req(url)
    filenames.append(save_file(resp))

# Parse the saved pages
from pyquery import PyQuery

with open("res.txt", "w", encoding="utf-8") as out:
    for file in filenames:
        with open(file, encoding="utf-8") as f:
            content = f.read()
        doc = PyQuery(content)
        tags = doc(".j_th_tit")  # select by class name; iterating yields lxml elements
        for tag in tags:
            # tag is an lxml element, so use the .text attribute;
            # calling tag.text() raises TypeError: 'str' object is not callable
            if tag.text:
                out.write(tag.text + "\n")
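As the comment above notes, iterating over doc(".j_th_tit") yields plain lxml elements, so .text is an attribute and tag.text() raises TypeError. If you would rather keep working with PyQuery objects, .items() is an alternative (a sketch, not part of the original exercise):

# .items() yields PyQuery wrappers, so .text() and .attr() are available.
for item in doc(".j_th_tit").items():
    print(item.text())         # the link text
    print(item.attr("href"))   # e.g. /p/6689385508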
