Web Scraping Examples

2022-07-03  开心的小哈

Hands-on Practice

  1. Scrape the results page for a given search keyword from Sogou (to be completed; the code below actually queries Bing)
import requests

heads = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"}


def get_url():
    # Build the search URL from a keyword entered by the user
    kw = input('Please enter a search keyword: ')
    return "https://cn.bing.com/search?q=" + kw


def get_data(url):
    # Request the results page and save the raw HTML
    res = requests.get(url, headers=heads)
    save_file("search_results", res.text)


def save_file(name, data):
    with open(name + ".html", "w", encoding="utf-8") as wf:
        wf.write(data)


if __name__ == "__main__":
    url = get_url()
    get_data(url)
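
The list item above targets Sogou while the code queries Bing. A minimal sketch of the Sogou version, assuming Sogou's usual /web?query= search path; the keyword goes through the params argument so requests handles the URL encoding:

import requests

heads = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"}


def get_sogou_page(kw):
    # Assumption: https://www.sogou.com/web?query=<kw> is Sogou's search URL;
    # requests percent-encodes the keyword when it is passed via params.
    res = requests.get("https://www.sogou.com/web", params={"query": kw}, headers=heads)
    res.encoding = res.apparent_encoding  # in case the page does not declare UTF-8
    with open("sogou_" + kw + ".html", "w", encoding="utf-8") as wf:
        wf.write(res.text)


if __name__ == "__main__":
    get_sogou_page(input("Please enter a search keyword: "))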

  2. Scrape ("crack") Baidu Translate
import requests
import json

def fanyi(kw):
    url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'REALTIME_TRANS_SWITCH=1; SOUND_SPD_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_PREFER_SWITCH=1; APPGUIDE_10_0_2=1; BIDUPSID=6A0B90F549B4E722A96A29666574A81B; PSTM=1653639263; BAIDUID=6A0B90F549B4E722B61B0337963B4817:FG=1; BAIDUID_BFESS=6A0B90F549B4E722B61B0337963B4817:FG=1; ZFY=NmGJc7JlHfQ:BLiuJWcMARBy:BusCUodzUtsi4qGc2tfQ:C; BAIDU_WISE_UID=wapp_1655032828797_134; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1655299374,1655300476,1655300668,1655734579; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1655734579; ab_sr=1.0.1_OTFhOWFiNmI5NzQyMDY0OTQwZGIwMDE5OTRiM2M1Y2I3OTlmOTRhMWQ0MGRiMjMwYzU2MjJjOGUyYWZiYzJmNmYyYjU0MTE0ODU1MGI2NTdkOTI0OGFjMDlmYTg2NTBkODU5MmE0NWE3MzM1ZjE2OGVhNDY1MzRjNjhhMmQzNzZmNjAyZWQxYzI1ZDkwNjdlZjI3M2MzMDE4OWYzN2FkNQ==',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'
        }
    # Note: sign and token are generated by the page's JavaScript for a specific query and
    # session, so the hard-coded values below only reproduce the request they were captured from.
    data = {'from': 'en', 'to': 'zh', 'query': kw, 'transtype': "realtime", 'simple_means_flag': "3",
            "sign": '830707.544706', 'token': '98cd61560d5388bcc7d0ff60c08c4158', 'domain': 'common'}

    res = requests.post(url, headers=headers, data=data)
    result = res.json()
    print(result)
    # Persist the JSON response; ensure_ascii=False keeps the Chinese text readable in the file
    with open("./11.txt", "w", encoding="utf-8") as fp:
        json.dump(result, fp=fp, ensure_ascii=False)

kw = input('please input key')
fanyi(kw)
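
Because the sign/token pair above is tied to a single captured request, many older tutorials use Baidu Translate's suggestion endpoint instead, which needs neither. A minimal sketch, assuming https://fanyi.baidu.com/sug still accepts a form-encoded kw field and returns JSON (both assumptions):

import requests

def fanyi_sug(kw):
    url = 'https://fanyi.baidu.com/sug'  # assumption: legacy suggestion endpoint
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36'}
    res = requests.post(url, headers=headers, data={'kw': kw})
    # Expected response shape (assumption): {"errno": 0, "data": [{"k": "dog", "v": "n. 狗; ..."}]}
    print(res.json())

fanyi_sug('dog')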

  3. Scrape the Douban movie ranking list by category
import json
import requests

headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'}

def Test01():
    # Douban exposes the category ranking as a JSON API
    url = "https://movie.douban.com/j/chart/top_list"
    param = {'type': '24',            # movie category id
             'interval_id': '100:90', # rating percentile range
             'action': '',
             'start': '10',           # offset of the first movie to return
             'limit': '20'            # number of movies per request
             }

    res = requests.get(url, params=param, headers=headers)
    list_data = res.json()
    print(list_data)
    with open('./douban.json', "w", encoding='utf-8') as fp:
        json.dump(list_data, fp=fp, ensure_ascii=False)

def getKDJ():
    ke = input('Please enter a city: ')
    page = 1
    while True:
        url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
        data = {'cname': "", 'pid': '', "keyword": ke, "pageIndex": page}
        res = requests.post(url, headers=headers, data=data)
        print(res.text)
        # The endpoint returns JSON as text; parse it rather than re-serializing the string
        data_list = json.loads(res.text)
        print(type(data_list), data_list)
        page += 1
        commit = input("View the next page? (y/n) ")
        if commit != "y":
            return None

def getGJJG():
    url='http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?hKHnQfLv=53BHcvFf57UciNYtofCPsA9FO16baEIXUe19bRBXhg7rBlmtCHnsik_byn0PzLSY3bVUkihuxWe_sgkAX1WZu_1ybEqU5PPA8hR28JBI5590cYX5rUp16.UNrukVygjFWnB30adCTRLG8jFAp34jFBTtUzms3I0GZlZnxHGJd6HNNWBc_rsje99ao6.US098joA5m4._S2_rOpW2K4U5gu_ojQwSTPniQeOCJRMcaSNX2JDrrhKQNHKHt7Dm6iB_9St26DTwNP5.6TPTVnXNYAngMOkdQoWtp2ClluzSlM3yYvV4SEqFvBQW2JAyrd5ttfZc2rBIpwKA902YkpzXr60lQnJgQo6kbc4L7JK4P94l&8X7Yi61c=4w_361nsYEBznepRcSH0pxcubexKO5Vosw.LtgenPXT_Ik.uhuVjgKDrUG9OVVt97Oo9eEmGIUB9yUqnErd5hJqL1TUMki1bYTFHRHZoNTE5tDPeKYcioTBdHtBMEpkNu'
    # First page of the licensed-company list; page/pageSize control paging
    data = {'on': 'true', 'page': 1, "pageSize": 15, "productName": "", "conditionType": 1, "applyname": ""}

    res = requests.post(url, data=data, headers=headers)
    print(res.text)
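
getGJJG only pulls the first list page. In older versions of this exercise, each company's detail record is fetched with a second POST keyed by the ID field from the list response; a hedged sketch, assuming the getXkzsList/getXkzsById endpoints and the "list"/"ID" field names still behave as in those tutorials (all of them assumptions):

def getGJJGDetail():
    # Assumption: list endpoint returns JSON with a "list" array whose items carry an "ID" field
    list_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    detail_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    data = {'on': 'true', 'page': 1, "pageSize": 15, "productName": "", "conditionType": 1, "applyname": ""}
    ids = [item['ID'] for item in requests.post(list_url, data=data, headers=headers).json()['list']]
    all_detail = []
    for _id in ids:
        # Assumption: the detail endpoint takes the company id as a form field named "id"
        detail = requests.post(detail_url, data={'id': _id}, headers=headers).json()
        all_detail.append(detail)
    with open('./nmpa_detail.json', 'w', encoding='utf-8') as fp:
        json.dump(all_detail, fp=fp, ensure_ascii=False)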







  4. Scrape KFC restaurant addresses by keyword (implemented in getKDJ above)


  5. Scrape data from the National Medical Products Administration (implemented in getGJJG above)
  6. bs4 practice: locating chapters on the Shicimingju classical-literature site
from bs4 import BeautifulSoup
import requests

def get_data():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'}
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    page_text = requests.get(url, headers=headers)
    # Parse the chapter titles and detail-page links from the index page
    page_text = page_text.content.decode('utf-8')
    soup = BeautifulSoup(page_text, 'lxml')
    li_list = soup.select('div.book-mulu > ul > li')
    with open('./sango.txt', 'w', encoding='utf-8') as fp:
        for li in li_list:
            title = li.a.string
            # href already starts with "/", so join it onto the bare domain
            content_url = 'https://www.shicimingju.com' + li.a['href']
            # Request the detail page and extract the chapter text
            detail_page_text = requests.get(content_url, headers=headers).content.decode('utf-8')
            detail_soup = BeautifulSoup(detail_page_text, 'lxml')

            div_tag = detail_soup.find('div', class_='chapter_content')
            content = div_tag.get_text()
            fp.write(title + ':' + content + '\n')
            print(title, 'scraped successfully')

get_data()


XPath practice 1

# XPath city-parsing example
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37',
    'Host': 'www.aqistudy.cn'
}

# Raw string so the backslashes in the Windows path are not treated as escapes
filePath = r'D:\PyTest\XpathDemo1\ddd.html'

def get_data():
    # url = 'http://www.aqistudy.cn/historydata/'
    # res = requests.get(url=url, headers=headers)
    # data = res.text
    # tree = etree.HTML(data)
    # Test against a locally saved copy of the page instead:
    tree = etree.parse(filePath, etree.HTMLParser())
    hot_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
    all_city_names = []
    for li in hot_li_list:
        name = li.xpath('./a/text()')[0]
        all_city_names.append(name)

    city_li_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
    for li in city_li_list:
        city_name = li.xpath('./a/text()')[0]
        all_city_names.append(city_name)

    print(all_city_names, len(all_city_names))

def get_data2():
    # Same result with a single XPath using the union operator "|"
    tree = etree.parse(filePath, etree.HTMLParser())
    city_names = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
    print(city_names, len(city_names))

if __name__ == "__main__":
    get_data2()


XPath practice 2

  1. Download the free résumé templates (a hedged sketch follows below)
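
No code was included for this item. A minimal sketch, assuming the templates are listed at https://sc.chinaz.com/jianli/free.html (the site commonly used for this exercise); the XPath expressions and the detail-page layout are assumptions:

import os
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36'}

def download_resume_templates():
    list_url = 'https://sc.chinaz.com/jianli/free.html'  # assumption: free-template listing page
    res = requests.get(list_url, headers=headers)
    res.encoding = res.apparent_encoding
    tree = etree.HTML(res.text)
    os.makedirs('./resumes', exist_ok=True)
    # Assumption: each template card links to its detail page from an <a> inside div#container
    detail_links = tree.xpath('//div[@id="container"]/div/a/@href')
    for link in detail_links:
        if link.startswith('//'):  # protocol-relative links
            link = 'https:' + link
        detail = requests.get(link, headers=headers)
        detail.encoding = detail.apparent_encoding
        detail_tree = etree.HTML(detail.text)
        # Assumption: the first entry in the download-address list is a direct archive link
        down_urls = detail_tree.xpath('//div[@class="clearfix mt20 downlist"]//li[1]/a/@href')
        if not down_urls:
            continue
        data = requests.get(down_urls[0], headers=headers).content
        name = os.path.join('./resumes', down_urls[0].split('/')[-1])
        with open(name, 'wb') as wf:
            wf.write(data)
        print(name, 'downloaded')

download_resume_templates()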

Captcha recognition
Recognize the captcha on the Gushiwen login page: 登录古诗文网 (gushiwen.cn)

  1. Download the captcha image locally
  2. Send the image to a third-party recognition platform and use the returned text (see the sketch after this list)
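
A minimal sketch of step 1, assuming the login page is https://so.gushiwen.cn/user/login.aspx and that the captcha <img> carries id="imgCode" (both assumptions); step 2 depends on whichever recognition platform you use, so only a placeholder call is shown:

import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36'}

# Use a Session so the captcha image is fetched with the same cookies as the login page;
# otherwise the recognized code will not match the one the server expects.
session = requests.Session()

login_url = 'https://so.gushiwen.cn/user/login.aspx'  # assumption: current login page URL
page_text = session.get(login_url, headers=headers).text
tree = etree.HTML(page_text)

# Assumption: the captcha <img> has id="imgCode" and a relative src
img_src = 'https://so.gushiwen.cn' + tree.xpath('//img[@id="imgCode"]/@src')[0]
img_data = session.get(img_src, headers=headers).content
with open('./code.jpg', 'wb') as fp:
    fp.write(img_data)

# Step 2: hand ./code.jpg to your recognition platform's client and use the returned text
# code_text = recognize_captcha('./code.jpg')  # hypothetical helper supplied by the platform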

Multithreading: downloading with a thread pool

import requests
from lxml import etree
import re
from multiprocessing.dummy import Pool
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.66 Safari/537.36 Edg/103.0.1264.44'}
url = 'https://www.pearvideo.com/category_8'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
urls_video = []
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    # The real video address comes from the videoStatus.jsp API behind the detail page
    info_page = detail_url.split('_')[-1]
    info_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + info_page + '&mrd=0.20124002223369164'
    headers['Referer'] = detail_url  # the API rejects requests without the detail page as Referer
    detail_page_text = requests.get(url=info_url, headers=headers).json()
    video = detail_page_text.get('videoInfo').get('videos').get('srcUrl')
    # srcUrl contains a fake timestamp segment; replace it with "cont-<id>" to get the playable URL
    d_video = 'cont-%s' % (info_page)
    fake_segment = re.split('-|/', video)[6]
    video_path = video.replace(fake_segment, d_video)
    dic = {
        'name': name,
        'url': video_path
    }
    urls_video.append(dic)

# Use a thread pool for the slow, blocking part: downloading the video bytes
def get_video_data(video_dic):
    url = video_dic['url']
    name = video_dic['name']
    name = re.sub(r'[\\/:*?"<>|]', '-', name)  # strip characters that are illegal in file names
    print(name, 'downloading...')
    res = requests.get(url, headers=headers).content
    with open(name, 'wb') as wf:
        wf.write(res)
        print(name, 'downloaded!')

pool=Pool(4)

pool.map(get_video_data,urls_video)

pool.close()
pool.join()
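
multiprocessing.dummy.Pool works, but the standard-library concurrent.futures.ThreadPoolExecutor is the more idiomatic choice today; an equivalent sketch for the download step:

from concurrent.futures import ThreadPoolExecutor

# map() schedules get_video_data for every dict in urls_video across 4 worker threads,
# and leaving the with-block waits for all downloads to finish.
with ThreadPoolExecutor(max_workers=4) as executor:
    executor.map(get_video_data, urls_video)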

Logging in to the 12306 website
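
The original entry stops at the heading. 12306's login flow relies on JavaScript-driven verification, so requests alone is not enough; a minimal selenium sketch that only opens the account-login form and fills in the credentials (the URL and the element ids J-userName / J-password / J-login are assumptions, and any captcha or slider step still has to be handled separately):

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://kyfw.12306.cn/otn/resources/login.html')  # assumption: current login page

# Assumption: the account-login form exposes these element ids
driver.find_element(By.ID, 'J-userName').send_keys('your_username')
driver.find_element(By.ID, 'J-password').send_keys('your_password')
driver.find_element(By.ID, 'J-login').click()

# Whatever verification 12306 shows next (slider, SMS code, image captcha) must be
# completed here before the session cookies become usable.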
