py爬虫4:练习之函数式编程

2022-07-22  本文已影响0人  _百草_
import os
import time
from typing import Dict,  AnyStr
from urllib import parse
from faker import Faker
from urllib.request import Request, urlopen


# 函数式修改程序
# 使得程序思路更清晰

# 拼接url
def get_url(base_url, param: Dict):
    """
    Build the full request URL by appending an encoded query string.

    :param base_url: URL without a query string
    :param param: mapping of query parameter names to values
    :return: ``base_url`` followed by ``?`` and the URL-encoded parameters
    """
    query_string = parse.urlencode(param)
    return f"{base_url}?{query_string}"


# 发送请求
def get_req(url: str, timeout: float = 10.0):
    """
    Send a GET request to ``url`` with a randomly generated User-Agent header.

    :param url: complete URL to request (query string already encoded)
    :param timeout: seconds to wait on blocking network operations; without
        a timeout a stalled server would block the caller indefinitely
    :return: the response object returned by ``urlopen``; the caller is
        responsible for reading and closing it
    """
    fake = Faker(locale="zh_CN")
    headers = {
        "User-agent": fake.user_agent()
    }
    req = Request(url, headers=headers)
    return urlopen(req, timeout=timeout)


# 获取文件后缀
def get_extension(resp):
    """
    Map the response's Content-Type header to a file extension.

    :param resp: response object whose ``info()`` returns the header mapping
    :return: ``".html"`` when the Content-Type contains ``text/html``;
        otherwise ``None`` (a notice is printed)
    """
    # The header lookup yields None when Content-Type is absent; fall back
    # to an empty string so the membership test below cannot raise TypeError.
    content_type = resp.info().get("Content-Type") or ""
    # Media types are case-insensitive, so compare in lowercase.
    if "text/html" in content_type.lower():
        return ".html"
    print("不支持的类型")
    return None


# 保存文件
def save_file(resp):
    """
    Save the response body to a timestamped file next to this script.

    The file is named ``<YYYYmmddHHMMSS><ext>`` and is written as UTF-8 text.
    Once the body has been read the response is closed, so the underlying
    connection is not leaked.

    :param resp: response object providing ``info()``, ``read()`` and ``close()``
    :return: ``"不支持的文件类型"`` when the content type is unsupported
        (nothing is written and the response is left open); otherwise ``None``
    """
    ext = get_extension(resp)
    if not ext:
        return "不支持的文件类型"
    filename = os.path.join(os.path.dirname(__file__), f"{time.strftime('%Y%m%d%H%M%S')}{ext}")
    try:
        raw = resp.read()
        # Prefer the charset declared in the Content-Type header; pages are
        # not guaranteed to be UTF-8.
        charset = resp.info().get_content_charset() or "utf-8"
    finally:
        resp.close()
    try:
        content = raw.decode(charset)
    except (LookupError, UnicodeDecodeError):
        # Unknown or wrong declared charset: fall back to a lossy UTF-8
        # decode instead of discarding the page.
        content = raw.decode("utf-8", errors="replace")
    with open(filename, "w", encoding="utf-8") as f:
        f.write(content)


if __name__ == "__main__":
    # Demo run: search Baidu for a keyword and save the result page to disk.
    host = "https://www.baidu.com/s"
    word = {"wd": "百草"}
    target = get_url(host, word)
    res = get_req(target)
    save_file(res)

上一篇 下一篇

猜你喜欢

热点阅读