Python Scraper with BeautifulSoup: A Code Example
2023-05-17
我的小小笔尖
Complete code for downloading primary-school English resources, including word data (JSON files) and audio (MP3 files).
If execution is interrupted (for example, by a timeout or a refused request), you can set a breakpoint in the code to resume downloading from where it stopped; the Python script then has to be re-run.
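Each unit ends up as one JSON file under ./word_list/<level>/<book>/, holding a list of [word, phonetic, audio filename, Chinese meaning] entries, in the order the scraping code below builds them. The file shape looks like this (the sample values are illustrative only, not taken from the site):

{
    "wordlist": [
        ["apple", "['æpl]", "Apple.mp3", "苹果"],
        ["banana", "[bə'nɑːnə]", "Banana.mp3", "香蕉"]
    ]
}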
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import lxml  # parser backend for BeautifulSoup; must be installed
import os
import time
import json

gl_base_url = 'http://book.qsbdc.com/'
gl_is_continue_download = False

# Make a GET request; return the response on HTTP 200, otherwise None
def requestByUrl(url):
    resp = requests.get(url, timeout=(10, 30))
    time.sleep(1)  # sleep to avoid hitting the site too frequently
    if resp.status_code == 200:
        # success
        return resp
    else:
        return None

# Get the list of pagination links
# Example: http://book.qsbdc.com/book_list.php?class_id=1&&page_id=1
def getPages(url, level_name):
    resp = requestByUrl(url)
    if resp is None:
        return
    content = resp.content
    soup = BeautifulSoup(content, 'lxml')
    options_list = soup.select('tr > td > select > option')
    for option in options_list:
        page_url = url + '&page_id=' + option['value']
        print(page_url)
        # Get the list of book links on this page
        getBooks(page_url, level_name)

# Get the list of book links
# Example: http://book.qsbdc.com/book_info.php?book_id=1753
# Example: http://book.qsbdc.com/word_list.php?book_id=1753
def getBooks(url, level_name):
    global gl_base_url
    global gl_is_continue_download
    resp = requestByUrl(url)
    if resp is None:
        return
    content = resp.content
    soup = BeautifulSoup(content, 'lxml')
    options_list = soup.select('tr > td > span > a')
    for option in options_list:
        book_url = gl_base_url + option['href']
        book_url = book_url.replace('book_info', 'word_list')
        book_name = option.getText().strip()
        print(book_url, book_name)
        # Whether to resume downloading from a breakpoint
        isUseBreakPoint = False
        if isUseBreakPoint:
            # Set the book from which to resume (name as it appears on the site)
            continue_download_book_name = '《新编小学英语》第一册'
            if book_name == continue_download_book_name:
                gl_is_continue_download = True
                print('=============== resuming download ===============')
            if gl_is_continue_download:
                # Get the list of unit links
                getUnits(book_url, level_name, book_name)
        else:
            # Get the list of unit links
            getUnits(book_url, level_name, book_name)

# Get the list of unit links for a book
# Example: http://book.qsbdc.com/word_list.php?tag=all&book_id=1421&group_id=19122
def getUnits(url, level_name, book_name):
    global gl_base_url
    resp = requestByUrl(url)
    if resp is None:
        return
    content = resp.content
    soup = BeautifulSoup(content, 'lxml')
    options_list = soup.select('div > div > .enbook_group_l > li > a')
    for option in options_list:
        unit_url = gl_base_url + 'word_list.php' + option['href']
        unit_name = 'Unit' + option.getText().split('[')[0]
        print(unit_url, unit_name)
        # Get the pagination links of the unit's word list
        getWordListPages(unit_url, level_name, book_name, unit_name)

# Get the pagination links of the word list, collect all words, and save one JSON file per unit
# Example: http://book.qsbdc.com/word_list.php?book_id=582&tag=all&&group_id=10251&page_id=2
def getWordListPages(url, level_name, book_name, unit_name):
    resp = requestByUrl(url)
    if resp is None:
        return
    content = resp.content
    soup = BeautifulSoup(content, 'lxml')
    options_list = soup.select('tr > td > select > option')
    word_lists = []
    for option in options_list:
        wordListPage_url = url + '&page_id=' + option['value']
        # print(wordListPage_url)
        # Get the words on this page
        word_list = getWords(wordListPage_url, level_name, book_name, unit_name)
        word_lists.extend(word_list)
    # Write the JSON file
    savePath = './word_list/' + level_name + '/' + book_name + '/'
    filename = level_name + book_name + unit_name + '.json'
    json_obj = {"wordlist": word_lists}
    str_obj = json.dumps(json_obj, ensure_ascii=False)
    writeJsonFile(savePath, filename, str_obj)

# Get the words on one page and download their audio files
# Audio example: http://sound.yywz123.com/qsbdcword/G/GhdxC.mp3
def getWords(url, level_name, book_name, unit_name):
    resp = requestByUrl(url)
    if resp is None:
        return []
    content = resp.content
    soup = BeautifulSoup(content, 'lxml')
    options_list = soup.select('.table_solid > tr')
    word_list = []
    for opIdx, option in enumerate(options_list):
        # skip the first two rows (table headers)
        if opIdx > 1:
            arr_word = []
            tds = option.select('tr > td')
            # rows with fewer than 8 columns are not word rows
            if len(tds) < 8:
                continue
            word = tds[2].getText().strip()
            arr_word.append(word)
            phonetic = tds[3].getText().strip()
            arr_word.append(phonetic)
            # If there is an audio file, use its name; otherwise leave the name empty
            audio_filename = ''
            if tds[4].select('a'):
                audio_filename = tds[4].select('a')[0]['name'] + '.mp3'
            arr_word.append(audio_filename)
            chinese = tds[5].getText().strip()
            arr_word.append(chinese)
            word_list.append(arr_word)
            # print(word, phonetic, chinese)
            # If there is an audio file, download it
            if tds[4].select('a'):
                # audio URLs are grouped by the first letter of the filename
                audio_url = 'http://sound.yywz123.com/qsbdcword/' + audio_filename[0] + '/' + audio_filename
                savePath = './audio_list/' + level_name + '/' + book_name + '/' + unit_name + '/'
                if os.path.exists(os.path.join(savePath, audio_filename)):
                    # skip files that already exist (this is what makes resuming work)
                    print('file already exists', word, audio_filename)
                else:
                    downloadAudioFile(audio_url, savePath, audio_filename)
    return word_list

# Write a JSON string to a file
def writeJsonFile(savePath, filename, str_obj):
    # Create the folder if it does not exist
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    jsonFilePath = os.path.join(savePath, filename)
    print(time.strftime('%Y-%m-%d %H-%M-%S'), 'writing JSON file: ' + filename)
    with open(jsonFilePath, 'w', encoding='utf-8') as file:
        file.write(str_obj)

# Download an audio file
def downloadAudioFile(downloadUrl, savePath, filename):
    # Create the folder if it does not exist
    if not os.path.exists(savePath):
        os.makedirs(savePath)
    mp3FilePath = os.path.join(savePath, filename)
    # print('saving audio file: ' + mp3FilePath)
    try:
        # fetch the MP3 resource and stream it to disk
        mp3Res = requests.get(downloadUrl, stream=True, timeout=(60, 120))
        with open(mp3FilePath, 'wb') as fd:
            for chunk in mp3Res.iter_content():
                fd.write(chunk)
    except requests.exceptions.ConnectTimeout:
        print('ConnectTimeout')
    except Exception as e:
        print(e)
    # time.sleep(0.5)  # sleep to avoid hitting the site too frequently

# Main entry point
def main():
    # the Chinese level names (primary / junior high / senior high school English)
    # are also used as output folder names, so they are kept as-is
    index_urls = [
        ['http://book.qsbdc.com/book_list.php?class_id=1', '小学英语'],
        ['http://book.qsbdc.com/book_list.php?class_id=2', '初中英语'],
        ['http://book.qsbdc.com/book_list.php?class_id=3', '高中英语'],
    ]
    for index_url in index_urls:
        # Get the pagination links for each level
        getPages(index_url[0], index_url[1])

if __name__ == '__main__':
    main()
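
If downloads keep getting interrupted by timeouts or refused requests, the request helper can also be hardened with a simple retry loop instead of relying on manual breakpoint resumes. Below is a minimal sketch of such a wrapper, reusing the requests and time imports already at the top of the script; it is a drop-in alternative to requestByUrl above, and the retry count and backoff values are arbitrary choices, not part of the original code:

def requestByUrlWithRetry(url, max_retries=3, backoff_seconds=2):
    # Retry a GET request a few times, waiting longer after each failure
    for attempt in range(1, max_retries + 1):
        try:
            resp = requests.get(url, timeout=(10, 30))
            if resp.status_code == 200:
                return resp
            print('HTTP %d for %s (attempt %d/%d)' % (resp.status_code, url, attempt, max_retries))
        except requests.exceptions.RequestException as e:
            print('request failed for %s (attempt %d/%d): %s' % (url, attempt, max_retries, e))
        time.sleep(backoff_seconds * attempt)  # linear backoff between attempts
    return None

Swapping requestByUrlWithRetry in for the requestByUrl calls would let transient failures recover automatically, at the cost of waiting longer before a URL is finally given up on.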