Web Scraping Examples
2022-07-03
开心的小哈
Hands-on practice
- Scrape the results page for a user-specified search keyword (to be completed; the code below queries Bing rather than Sogou, and a Sogou sketch follows the block)
import requests

heads = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"}

def get_url():
    # Build the search URL from the keyword entered by the user
    kw = input('请输入检索关键字')
    return "https://cn.bing.com/search?q=" + kw

def get_data(url):
    # Request the results page and save the raw HTML locally
    res = requests.get(url, headers=heads)
    save_file("检索结果数据", res.text)

def save_file(name, data):
    with open(name + ".html", "w", encoding="utf-8") as wf:
        wf.write(data)

if __name__ == "__main__":
    url = get_url()
    get_data(url)
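The bullet above is marked "to be completed" and the code queries Bing instead of Sogou. As a rough sketch of the Sogou variant, only the endpoint changes; the URL and its query parameter below are assumptions to verify against the live site, and Sogou may additionally require cookies to get past its verification page.

import requests

heads = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44"}

def get_sogou_url():
    # Assumed Sogou search endpoint and parameter name ("query"); check before relying on it
    kw = input('请输入检索关键字')
    return "https://www.sogou.com/web?query=" + kw

if __name__ == "__main__":
    res = requests.get(get_sogou_url(), headers=heads)
    with open("sogou_result.html", "w", encoding="utf-8") as wf:
        wf.write(res.text)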
- Scrape Baidu Translate (working around its signed request parameters)
import requests
import urllib.parse as parse
import json

def fanyi(kw):
    url = 'https://fanyi.baidu.com/v2transapi?from=en&to=zh'
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Cookie': 'REALTIME_TRANS_SWITCH=1; SOUND_SPD_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; SOUND_PREFER_SWITCH=1; APPGUIDE_10_0_2=1; BIDUPSID=6A0B90F549B4E722A96A29666574A81B; PSTM=1653639263; BAIDUID=6A0B90F549B4E722B61B0337963B4817:FG=1; BAIDUID_BFESS=6A0B90F549B4E722B61B0337963B4817:FG=1; ZFY=NmGJc7JlHfQ:BLiuJWcMARBy:BusCUodzUtsi4qGc2tfQ:C; BAIDU_WISE_UID=wapp_1655032828797_134; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1655299374,1655300476,1655300668,1655734579; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1655734579; ab_sr=1.0.1_OTFhOWFiNmI5NzQyMDY0OTQwZGIwMDE5OTRiM2M1Y2I3OTlmOTRhMWQ0MGRiMjMwYzU2MjJjOGUyYWZiYzJmNmYyYjU0MTE0ODU1MGI2NTdkOTI0OGFjMDlmYTg2NTBkODU5MmE0NWE3MzM1ZjE2OGVhNDY1MzRjNjhhMmQzNzZmNjAyZWQxYzI1ZDkwNjdlZjI3M2MzMDE4OWYzN2FkNQ==',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'
    }
    # sign and token are copied from a captured browser request; Baidu's page JavaScript
    # generates them, so these hard-coded values may only work for the original session/query
    data = {'from': 'en', 'to': 'zh', 'query': kw, 'transtype': "realtime", 'simple_means_flag': "3",
            "sign": '830707.544706', 'token': '98cd61560d5388bcc7d0ff60c08c4158', 'domain': 'common'}
    # data = parse.urlencode(data).encode()
    # print(data)
    # data = f"from=en&to=zh&query={kw}&transtype=realtime&simple_means_flag=3&sign=830707.544706&token=98cd61560d5388bcc7d0ff60c08c4158&domain=common"
    print(data)
    res = requests.post(url, headers=headers, data=data)
    # print(res.text.encode('utf-8').decode("unicode_escape"))
    # page_text.content.decode('utf-8')  # decode the response body as UTF-8
    # print(res.content.decode('unicode-escape'))  # decode Unicode escapes so Chinese characters display
    result = res.json()
    print(result)
    # Dump the JSON response to a local file, keeping Chinese characters readable
    with open("./11.txt", "w", encoding="utf-8") as fp:
        json.dump(result, fp=fp, ensure_ascii=False)

kw = input('please input key')
fanyi(kw)
- Scrape the Douban movie ranking by category
import json
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'}

def Test01():
    # Douban category ranking: the page loads its data from an XHR endpoint that returns JSON
    url = "https://movie.douban.com/j/chart/top_list"
    param = {'type': '24',
             'interval_id': '100:90',
             'action': '',
             'start': '10',  # offset into the ranking
             'limit': '20'   # number of movies per request
             }
    res = requests.get(url, params=param, headers=headers)
    list_data = res.json()
    print(list_data)
    with open('./douban.json', "w", encoding='utf-8') as fp:
        json.dump(list_data, fp=fp, ensure_ascii=False)

def getKDJ():
    # KFC store locator: POST a keyword and page index, the endpoint answers with JSON
    ke = input('请输入城市')
    a = 1
    while True:
        url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
        data = {'cname': "", 'pid': '', "keyword": ke, "pageIndex": a}
        res = requests.post(url, headers=headers, data=data)
        print(res.text)
        data_list = json.loads(res.text)
        print(type(data_list), data_list)
        a += 1
        commit = input("是否查看下一页数据,是进行/否退出")
        if commit != "是":
            return None
        # otherwise loop again; the next iteration rebuilds data with the incremented pageIndex

def getGJJG():
    # NMPA (国家药品监管总局) licensed-company list; the long query string is an
    # anti-crawling token copied from the browser and will eventually expire
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?hKHnQfLv=53BHcvFf57UciNYtofCPsA9FO16baEIXUe19bRBXhg7rBlmtCHnsik_byn0PzLSY3bVUkihuxWe_sgkAX1WZu_1ybEqU5PPA8hR28JBI5590cYX5rUp16.UNrukVygjFWnB30adCTRLG8jFAp34jFBTtUzms3I0GZlZnxHGJd6HNNWBc_rsje99ao6.US098joA5m4._S2_rOpW2K4U5gu_ojQwSTPniQeOCJRMcaSNX2JDrrhKQNHKHt7Dm6iB_9St26DTwNP5.6TPTVnXNYAngMOkdQoWtp2ClluzSlM3yYvV4SEqFvBQW2JAyrd5ttfZc2rBIpwKA902YkpzXr60lQnJgQo6kbc4L7JK4P94l&8X7Yi61c=4w_361nsYEBznepRcSH0pxcubexKO5Vosw.LtgenPXT_Ik.uhuVjgKDrUG9OVVt97Oo9eEmGIUB9yUqnErd5hJqL1TUMki1bYTFHRHZoNTE5tDPeKYcioTBdHtBMEpkNu'
    data = {'on': 'true', 'page': 1, "pageSize": 15, "productName": "", "conditionType": 1, "applyname": ""}
    res = requests.post(url, data=data, headers=headers)
    print(res.text)
- Query KFC restaurant locations (getKDJ above)
- Scrape related data from the NMPA (国家药品监管总局) portal (getGJJG above; a sketch for per-company details follows)
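getGJJG above only fetches the list page. A possible next step, sketched purely as an assumption (the classic version of this exercise posted each company's id to a getXkzsById endpoint; the anti-crawling token in the current list URL suggests the detail request may now need the same treatment), would be:

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'}

def get_company_detail(company_id):
    # Assumed detail endpoint; it may also require the anti-crawling token seen in the list URL
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    res = requests.post(url, data={'id': company_id}, headers=headers)
    return res.json()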
- BeautifulSoup (bs4) locating practice on the classical-text site shicimingju.com
from bs4 import BeautifulSoup
import requests

def get_data():
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.124 Safari/537.36 Edg/102.0.1245.44'}
    url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
    page_text = requests.get(url, headers=headers)
    # Parse the chapter titles and detail-page links out of the index page
    page_text = page_text.content.decode('utf-8')
    soup = BeautifulSoup(page_text, 'lxml')
    # Each <li> under div.book-mulu holds one chapter title and its URL
    li_list = soup.select('div.book-mulu > ul > li')
    fp = open('./sango.txt', 'w', encoding='utf-8')
    for li in li_list:
        title = li.a.string
        content_url = 'https://www.shicimingju.com/' + li.a['href']
        # Request the detail page and pull out the chapter text
        detail_page_text = requests.get(content_url, headers=headers).content.decode('utf-8')
        detail_soup = BeautifulSoup(detail_page_text, 'lxml')
        div_tag = detail_soup.find('div', class_='chapter_content')
        content = div_tag.get_text()
        fp.write(title + ':' + content + '\n')
        print(content, '爬取成功')
    fp.close()

get_data()
XPath Exercise 1
# XPath city-parsing example
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36 Edg/103.0.1264.37',
    'Host': 'www.aqistudy.cn'
}
filePath = r'D:\PyTest\XpathDemo1\ddd.html'

def get_data():
    # url = 'http://www.aqistudy.cn/historydata/'
    # res = requests.get(url=url, headers=headers)
    # data = res.text
    # tree = etree.HTML(data)
    # Parse a locally saved copy of the page instead of hitting the site every run
    # with open('./ddd.html', 'r', encoding='utf-8') as wf:
    #     s = wf.read()
    tree = etree.parse(filePath, etree.HTMLParser())
    # tree = etree.HTML(filePath)
    # Hot cities sit directly under div.bottom > ul > li
    host_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
    all_ctty_names = []
    for li in host_li_list:
        name = li.xpath('./a/text()')[0]
        all_ctty_names.append(name)
    # The full city list sits under div.bottom > ul > div[2] > li
    ctiy_names_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
    for li in ctiy_names_list:
        cty_name = li.xpath('./a/text()')[0]
        all_ctty_names.append(cty_name)
    print(all_ctty_names, len(all_ctty_names))

def get_data2():
    # Same result with a single XPath that unions both locations
    tree = etree.parse(filePath, etree.HTMLParser())
    ctty_names = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
    print(ctty_names, len(ctty_names))

if __name__ == "__main__":
    get_data2()
XPath Exercise 2
- Download the free resume templates (see the sketch below)
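No code was written for this exercise yet. Here is a minimal sketch using the same requests + lxml pattern as above; the listing URL (a 站长素材 free-resume page) and both XPath expressions are assumptions that need to be checked against the live page before use.

import os
import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36'}
# Assumed listing page for free resume templates
list_url = 'https://sc.chinaz.com/jianli/free.html'

os.makedirs('./resumes', exist_ok=True)
page_text = requests.get(list_url, headers=headers).text
tree = etree.HTML(page_text)
# Assumed XPath: each template card links to its detail page
detail_urls = tree.xpath('//div[@id="container"]/div/a/@href')
for detail_url in detail_urls:
    if detail_url.startswith('//'):
        detail_url = 'https:' + detail_url
    detail_tree = etree.HTML(requests.get(detail_url, headers=headers).text)
    # Assumed XPath: the first entry in the download-mirror list
    down_links = detail_tree.xpath('//ul[@class="clearfix"]/li/a/@href')
    if not down_links:
        continue
    name = detail_url.split('/')[-1].replace('.htm', '') + '.rar'
    data = requests.get(down_links[0], headers=headers).content
    with open(os.path.join('./resumes', name), 'wb') as wf:
        wf.write(data)
    print(name, '下载完成')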
CAPTCHA recognition
Target: the CAPTCHA on the login page of 古诗文网 (gushiwen.cn)
- Download the CAPTCHA image to a local file
- Call the recognition platform's API to identify the characters in the image (see the sketch below)
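A minimal sketch of the two steps above, assuming the login page is https://so.gushiwen.cn/user/login.aspx and the CAPTCHA <img> element has id="imgCode"; recognize_captcha is a stand-in for whatever recognition service (or manual typing) is actually used.

import requests
from lxml import etree

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.53 Safari/537.36'}
# Assumed login page URL for gushiwen.cn
login_url = 'https://so.gushiwen.cn/user/login.aspx'

session = requests.Session()  # keep cookies so the CAPTCHA stays tied to this session
page_text = session.get(login_url, headers=headers).text
tree = etree.HTML(page_text)
# Assumed XPath: the CAPTCHA image has id="imgCode" and a relative src
img_src = 'https://so.gushiwen.cn' + tree.xpath('//img[@id="imgCode"]/@src')[0]
with open('./code.jpg', 'wb') as wf:
    wf.write(session.get(img_src, headers=headers).content)

def recognize_captcha(path):
    # Placeholder: upload the image to a recognition platform here, or read it off the screen yourself
    return input('请输入验证码: ')

print('识别结果:', recognize_captcha('./code.jpg'))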
Multithreading: downloading with a thread pool
import requests
from lxml import etree
import re
from multiprocessing.dummy import Pool

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.5060.66 Safari/537.36 Edg/103.0.1264.44'}

url = 'https://www.pearvideo.com/category_8'
page_text = requests.get(url, headers=headers).text
tree = etree.HTML(page_text)
li_list = tree.xpath('//ul[@id="listvideoListUl"]/li')
urls_video = []
for li in li_list:
    detail_url = 'https://www.pearvideo.com/' + li.xpath('./div/a/@href')[0]
    name = li.xpath('./div/a/div[2]/text()')[0] + '.mp4'
    # The real video address comes from the videoStatus.jsp endpoint, not the detail page itself
    info_page = detail_url.split('_')[-1]
    info_url = 'https://www.pearvideo.com/videoStatus.jsp?contId=' + info_page + '&mrd=0.20124002223369164'
    headers['Referer'] = detail_url  # the endpoint rejects requests without a Referer
    detail_page_text = requests.get(url=info_url, headers=headers).json()
    video = detail_page_text.get('videoInfo').get('videos').get('srcUrl')
    # srcUrl contains a placeholder segment; swap it for "cont-<id>" to get the downloadable address
    d_video = 'cont-%s' % (info_page)
    lis = re.split('-|/', video)[6]
    video_path = video.replace(lis, d_video)
    dic = {
        'name': name,
        'url': video_path
    }
    urls_video.append(dic)

# Use a thread pool for the slow, blocking part: downloading the video bytes
def get_video_data(video_dic):
    url = video_dic['url']
    name = video_dic['name']
    name = re.sub(r'[\/:*?"<>|]', '-', name)  # strip characters that are illegal in file names
    print(name, '正在下载...')
    res = requests.get(url, headers=headers).content
    with open(name, 'wb') as wf:
        wf.write(res)
    print(name, '下载成功!!!')

pool = Pool(4)
pool.map(get_video_data, urls_video)
pool.close()
pool.join()