Crawling Static Web Page Data: Approach and a Worked Example

2018-12-23  田小田txt

Crawling static web page data:

1. Requirements:

Decide which site to crawl and what content to extract, and where to store the results: an Excel file, a txt file, or a MySQL database.
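
For the storage part, here is a minimal sketch of the txt option (my own illustration, not code from this article; the (title, url) record format is an assumption):

    import csv

    def save_records(records, path='result.txt'):
        # records is assumed to be a list of (title, url) tuples
        with open(path, 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f, delimiter='\t')
            writer.writerows(records)

    save_records([('example post', 'https://tieba.baidu.com/p/5981347724')])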

2. Analysis:

Study the URL pattern of the target listing pages. In the Tieba example in section 4, the keyword is passed as the URL-encoded kw parameter, and the pn parameter grows by 50 per page, i.e. pn = (page - 1) * 50. Each listing page links to post detail pages, and each detail page contains the image URLs.

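The encoding rule is easy to verify with urllib (a standalone snippet, not part of the original post):

    from urllib import parse

    print(parse.quote('帅哥'))   # %E5%B8%85%E5%93%A5 -- the kw value seen in the URLs in section 4
    print(parse.urlencode({'kw': '帅哥', 'ie': 'utf-8', 'pn': 50}))
    # kw=%E5%B8%85%E5%93%A5&ie=utf-8&pn=50
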
3. Steps:

1) Build the listing-page URL for each page from the keyword and the page number.
2) Request the listing page and read its HTML source (a minimal sketch follows this list).
3) Extract each post's detail URL from the source with a regular expression.
4) Request each detail page and extract the image URLs, again with a regex.
5) Download every image and write it to a local folder.

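As a standalone illustration of steps 1 and 2 (my own minimal sketch; the timeout value is an arbitrary choice), fetching the first listing page looks like this:

    from urllib import parse, request
    from fake_useragent import UserAgent

    params = {'kw': '帅哥', 'ie': 'utf-8', 'pn': 0}
    url = 'https://tieba.baidu.com/f?' + parse.urlencode(params)
    req = request.Request(url, headers={'User-Agent': UserAgent().random})
    response = request.urlopen(req, timeout=10)   # fail fast instead of hanging
    print(response.read().decode('utf-8', 'ignore')[:200])
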
4. Static page example (downloading '帅哥' (handsome guy) images from Baidu Tieba):

    # Fetch pages with urllib
    from fake_useragent import UserAgent
    from urllib import parse, request
    import os
    import re

    # One shared UserAgent instance for randomized User-Agent headers
    ua = UserAgent()

    '''
    Analyze the URL pattern of the tieba listing pages, so that requests
    can be built from it:
    https://tieba.baidu.com/f?ie=utf-8&kw=帅哥&fr=search
    https://tieba.baidu.com/f?kw=%E5%B8%85%E5%93%A5&ie=utf-8&pn=50
    https://tieba.baidu.com/f?kw=%E5%B8%85%E5%93%A5&ie=utf-8&pn=100
    From each listing page, extract the detail URL of every post;
    from each detail page, extract the image URLs.
    '''

    # Goal: fetch images from Baidu Tieba and download them to disk
    def tbSpider(name, s_page, e_page):
        for page in range(s_page, e_page + 1):
            params = {
                'kw': name,
                'ie': 'utf-8',
                'pn': (page - 1) * 50   # each listing page holds 50 posts
            }
            # Encode the parameters as a URL query string
            result = parse.urlencode(params)
            full_url = 'https://tieba.baidu.com/f?' + result
            # Request the listing page and get its HTML source
            html = load_data(full_url)
            # Extract the detail URL of each post from the source
            tiezi_urlinfo = parse_page_detail_url(html)
            for note in tiezi_urlinfo:
                # e.g. https://tieba.baidu.com/p/5981347724
                detail_url = 'https://tieba.baidu.com' + note[0]
                title = note[1]
                print('Fetching the detail page of ' + title)
                # Request the detail page and get its HTML source
                html = load_data(detail_url)
                # Extract the image URLs from the detail page
                images = parse_detail_imageurl(html)
                download_image(images)

    def load_data(url):
        '''
        Request a URL with a random User-Agent and return the decoded HTML.
        :param url: the page to fetch
        :return: the page source, or None on a non-200 status
        '''
        req_header = {
            'User-Agent': ua.random
        }
        req = request.Request(url, headers=req_header)
        response = request.urlopen(req)
        if response.status == 200:
            return response.read().decode('utf-8', 'ignore')

    def download_image(images):
        '''
        Request each image URL, read the binary data, and store it locally.
        :param images: list of image URLs
        :return:
        '''
        os.makedirs('tieba_pic', exist_ok=True)   # make sure the folder exists
        for image_url in images:
            req_header = {
                'User-Agent': ua.random
            }
            req = request.Request(image_url, headers=req_header)
            response = request.urlopen(req)
            if response.status == 200:
                # Use the tail of the final URL as the file name
                filename = response.url[-20:]
                with open('tieba_pic/' + filename, 'wb') as file:
                    file.write(response.read())
                    print(filename, 'downloaded')

    def parse_page_detail_url(html):
        '''
        Use a regex to extract each post's detail URL and title from the
        HTML source of a listing page.
        :param html: the page source
        :return: list of (relative_url, title) tuples
        '''
        pattern = re.compile(
            r'<div\sclass="threadlist_title pull_left j_th_tit\s">'
            + r'.*?<a.*?href="(.*?)".*?>(.*?)</a>.*?</div>', re.S
        )
        res = re.findall(pattern, html)
        return res

    def parse_detail_imageurl(html):
        '''
        Use a regex to extract the image URLs from a post's detail page.
        :param html: the page source
        :return: list of image URLs
        '''
        pattern = re.compile(
            r'<img.*?class="BDE_Image".*?src="(.*?)".*?>', re.S
        )
        res = re.findall(pattern, html)
        return res

    if __name__ == '__main__':
        # Read the forum name and page range from the user
        name = input('Enter the tieba name: ')
        s_page = int(input('Enter the start page: '))
        e_page = int(input('Enter the end page: '))
        tbSpider(name, s_page, e_page)
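
To see what the extraction step returns, the detail-page regex can be run against a made-up HTML fragment (the snippet and URL below are fabricated for illustration):

    import re

    sample = '<img class="BDE_Image" src="https://example.com/forum/pic/abc.jpg" size="98">'
    pattern = re.compile(r'<img.*?class="BDE_Image".*?src="(.*?)".*?>', re.S)
    print(re.findall(pattern, sample))
    # ['https://example.com/forum/pic/abc.jpg']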