获取user agent

2019-10-25  本文已影响0人  孤泉冷月

List of User Agent Strings
这是一个英文网站,收录了大量浏览器的 User Agent 字符串。
我写了一个爬虫把这些 User Agent 全部抓取下来,并缓存到本地的 user_agent.json 文件中;然后提供一个随机函数,每次调用时随机返回其中一个 User Agent。

from bs4 import BeautifulSoup as bs
from urllib import request
import json
import random


class UserAgent:
    """Provide browser User-Agent strings scraped from useragentstring.com.

    The scraped list can be cached on disk as a JSON array in
    ``user_agent.json`` (relative to the current working directory) and is
    additionally memoized on the instance, so repeated calls to
    :meth:`random` neither re-read the file nor re-download the page.

    Args:
        cache: When true, make sure the on-disk cache exists while the
            instance is constructed (this may download the page).
        update: When true together with ``cache``, re-download the list and
            overwrite an already existing cache file.
    """

    _url = "http://useragentstring.com/pages/useragentstring.php?name=All"
    # Location of the on-disk cache; a relative path resolves against the CWD.
    _cache_file = "user_agent.json"
    # Seconds to wait for the remote server before giving up.
    _timeout = 30

    def __init__(self, cache=True, update=False):
        self._update = update
        # In-memory copy of the list; filled lazily on first use.
        self._agents = None
        if cache:
            self._save_as_json()

    def user_agent(self) -> list:
        """Return every known User-Agent string.

        The cache file is preferred; when it is missing the list is
        downloaded instead. A fresh list object is returned on each call so
        callers can modify it without affecting this instance.
        """
        return list(self._load())

    def _load(self) -> list:
        """Return the memoized list, loading it from cache or network once."""
        # An empty result is never memoized, so a later call can retry.
        if not self._agents:
            try:
                self._agents = self._read_json()
            except FileNotFoundError:
                self._agents = self._get_user_agent_from_html()
        return self._agents

    def _get_user_agent_from_html(self) -> list:
        """Download the listing page and extract the User-Agent entries.

        Returns:
            The stripped, non-empty link texts found under ``#liste ul li a``.

        Raises:
            urllib.error.URLError: If the page cannot be fetched.
        """
        req = request.Request(url=self._url)
        # The context manager closes the HTTP response; the timeout keeps a
        # stalled server from blocking the caller indefinitely.
        with request.urlopen(req, timeout=self._timeout) as response:
            html = response.read().decode("iso-8859-1")
        # The "lxml" parser requires the third-party lxml package.
        soup = bs(html, "lxml")
        agents = []
        for link in soup.select("#liste ul li a"):
            text = link.text.strip()
            if text:
                agents.append(text)
        return agents

    def _cache_exists(self) -> bool:
        """Return True when the cache file can be opened for reading."""
        try:
            with open(self._cache_file, "r", encoding="utf-8"):
                return True
        except FileNotFoundError:
            return False

    def _write_cache(self, agents: list) -> None:
        """Serialize ``agents`` to the cache file as a UTF-8 JSON array."""
        with open(self._cache_file, "w", encoding="utf-8") as fp:
            json.dump(agents, fp)

    def _save_as_json(self):
        """Create the cache file, or refresh it when ``update`` was requested.

        The download happens before the file is opened, so a failed request
        cannot leave an empty, unparseable cache file behind. An empty
        scrape result is not written, which keeps an existing good cache
        intact.
        """
        if not self._update and self._cache_exists():
            return
        agents = self._get_user_agent_from_html()
        if agents:
            self._write_cache(agents)
            self._agents = agents

    def _read_json(self) -> list:
        """Read the User-Agent list from the cache file.

        An unusable cache (invalid JSON, wrong encoding, not a list, or an
        empty list) is rebuilt from the website with a single download and
        the rebuilt list is returned.

        Returns:
            A non-empty list of User-Agent strings.

        Raises:
            FileNotFoundError: If the cache file does not exist, or if it was
                unusable and the website yielded no entries either.
        """
        try:
            with open(self._cache_file, "r", encoding="utf-8") as fp:
                agents = json.load(fp)
        except (json.JSONDecodeError, UnicodeDecodeError):
            agents = None
        if isinstance(agents, list) and agents:
            return agents

        agents = self._get_user_agent_from_html()
        if not agents:
            raise FileNotFoundError(
                "user agent cache is unusable and could not be rebuilt"
            )
        self._write_cache(agents)
        return agents

    def random(self) -> str:
        """Return one randomly chosen User-Agent string.

        Raises:
            IndexError: If no User-Agent strings could be obtained.
        """
        agents = self._load()
        if not agents:
            raise IndexError("no user agent strings are available")
        return random.choice(agents)


if __name__ == "__main__":
    # Demo entry point: build the provider with its default settings and
    # print a single randomly selected User-Agent string.
    print(UserAgent().random())

上一篇下一篇

猜你喜欢

热点阅读