百度知道爬取
2018-12-23 本文已影响33人
懵懂_傻孩纸
基本功能已经实现,代码待完善,可以满足基本爬取
from urllib.parse import quote

import requests
from lxml import etree
class ZhiDaoSpider(object):
    """Crawler for Baidu Zhidao (zhidao.baidu.com) search results.

    Prompts for a search keyword on construction, then pages through the
    search-result list (pn=0, 10, ... 740), fetching every answer's detail
    page and printing its title and body text.
    """

    def __init__(self):
        # Request headers. The original header key was "User - Agent"
        # (spaces inside the name), so no valid User-Agent was ever sent;
        # the value was also space-mangled. Both are fixed here.
        self.headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/72.0.3610.2 Safari/537.36"
            )
        }
        self.word = input("请输入需要搜索的内容:")
        self.page = 0
        # URL-encode the keyword: raw Chinese text interpolated into the
        # query string would build an invalid URL.
        self.base_url = (
            "https://zhidao.baidu.com/search?word=%s&pn=" % quote(self.word)
        )

    def send_request(self, url):
        """GET *url* with the crawler headers and return the Response.

        Raises requests.HTTPError on a non-2xx status instead of silently
        parsing an error page.
        """
        response = requests.get(url=url, headers=self.headers)
        response.raise_for_status()
        return response

    def send_request_detail(self, url_list):
        """Fetch a detail page.

        Kept for backward compatibility; identical to send_request, so it
        simply delegates.
        """
        return self.send_request(url_list)

    def pares_page(self, response):
        """Parse a result-list page and return the detail-page URLs.

        NOTE: the name keeps the original 'pares' typo so existing callers
        keep working.
        """
        html = etree.HTML(response.content)
        # xpath() already returns a plain list of href strings; no need to
        # copy it element by element as the original did.
        return html.xpath("//div[@id='wgt-list']/dl/dt/a/@href")

    def pares_details(self, details):
        """Parse a detail page: print the question title(s) and body text."""
        html_obj = etree.HTML(details.content)
        # Question title (usually a single node, but xpath returns a list).
        for title in html_obj.xpath("//div[@id='wgt-ask']/h1/span[1]/text()"):
            print(title)
        # Answer body text.
        details_list = html_obj.xpath("//div[@accuse='aContent']/text()")
        print(details_list)

    def main(self):
        """Crawl result pages pn=0, 10, ... 740 and print every answer."""
        page = 0
        # Same page range as the original (break when page reached 750
        # after processing pn=740), expressed as the loop condition.
        while page < 750:
            full_url = self.base_url + str(page)
            print(full_url)
            response = self.send_request(full_url)
            link_list = self.pares_page(response)
            print(link_list)
            # Iterate the URLs directly instead of range(len(...)).
            for url in link_list:
                print(url)
                details = self.send_request_detail(url)
                self.pares_details(details)
            page += 10
# Script entry point: build the spider (prompts for a keyword) and crawl.
if __name__ == "__main__":
    ZhiDaoSpider().main()