Py进阶程序员

百度知道爬取

2018-12-23  本文已影响33人  懵懂_傻孩纸

基本功能已经实现，代码仍待完善，但可以满足基本的爬取需求。

from urllib.parse import quote

import requests
from lxml import etree


class ZhiDaoSpider(object):
    """Crawl Baidu Zhidao search results and print each question found.

    For every search-result page the spider extracts the detail-page links,
    fetches each detail page, and prints the question title(s) and the list
    of answer text fragments.
    """

    # Amount the ``pn`` query offset advances between consecutive list pages.
    PAGE_STEP = 10
    # Exclusive upper bound for ``pn``; the last offset requested is 740.
    MAX_OFFSET = 750
    # Seconds to wait for a response; without a timeout ``requests`` can
    # block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self, word=None):
        """Set up request headers and the search URL prefix.

        Args:
            word: Search keyword. When ``None`` (the default) the keyword is
                read interactively from standard input.
        """
        if word is None:
            word = input("请输入需要搜索的内容:")

        # The header name must be exactly "User-Agent"; a key such as
        # "User - Agent" is sent as an unrelated header.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/72.0.3610.2 Safari/537.36"
        }
        self.word = word
        # Offset of the list page currently being crawled (updated by main()).
        self.page = 0
        # Percent-encode the keyword so characters like '&', '#' or spaces
        # cannot break the query string.
        self.base_url = (
            "https://zhidao.baidu.com/search?word=%s&pn="
            % quote(self.word, safe="")
        )

    def send_request(self, url):
        """Fetch a search-result (list) page and return the response."""
        return requests.get(
            url=url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )

    def send_request_detail(self, url_list):
        """Fetch a single detail page and return the response.

        Args:
            url_list: URL of one detail page (name kept for compatibility).
        """
        return self.send_request(url_list)

    def pares_page(self, response):
        """Parse a list page and return the detail-page URLs it links to.

        Returns an empty list when the response body is empty or cannot be
        parsed into a document.
        """
        if not response.content:
            return []
        # Pass raw bytes so lxml can pick up the page's declared encoding.
        html = etree.HTML(response.content)
        if html is None:
            return []
        return list(html.xpath("//div[@id='wgt-list']/dl/dt/a/@href"))

    def pares_details(self, details):
        """Parse a detail page and print its title(s) and answer contents.

        Nothing is printed when the response body is empty or unparseable.
        """
        if not details.content:
            return
        html_obj = etree.HTML(details.content)
        if html_obj is None:
            return

        # Question title
        for title in html_obj.xpath("//div[@id='wgt-ask']/h1/span[1]/text()"):
            print(title)

        # Answer contents
        details_list = html_obj.xpath("//div[@accuse='aContent']/text()")
        print(details_list)

    def main(self):
        """Walk the list pages in order and process every linked detail page."""
        for page in range(0, self.MAX_OFFSET, self.PAGE_STEP):
            self.page = page
            # Build the list-page URL for this offset
            full_url = self.base_url + str(page)
            print(full_url)

            link_list = self.pares_page(self.send_request(full_url))
            print(link_list)

            # Visit each detail-page URL
            for url in link_list:
                print(url)
                details = self.send_request_detail(url)
                self.pares_details(details)


if __name__ == "__main__":
    # Script entry point: build the spider and run the full crawl.
    ZhiDaoSpider().main()



上一篇 | 下一篇

猜你喜欢

热点阅读