
A Simple Python Crawler for Scraping "Fuli" (福利) GIFs from the Hupu Community

2018-02-22  盗花

The following program was tested and works. The key lines of code are commented, so I won't walk through them one by one; you can judge the scraping results for yourself. PS: before running, create a folder named hupu_gif in the current directory, or create it from code as in the sketch below.
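If you would rather not create the folder by hand, the same thing can be done at the top of the script; a minimal sketch using only the standard library (os.makedirs with exist_ok=True is a no-op when the folder already exists):

import os

os.makedirs('hupu_gif', exist_ok=True)  # create the download folder if it is not there yet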

# coding: utf-8
from urllib.request import urlopen, urlretrieve
from bs4 import BeautifulSoup
import re
import os

url = 'https://my.hupu.com/search?q=%E7%A6%8F%E5%88%A9'  # Hupu search results for "福利"; the percent-encoded q= value spells out those two characters
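# (Aside: the q= value can be reproduced with the standard library:
#  urllib.parse.quote('福利') returns '%E7%A6%8F%E5%88%A9'.)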

htmls = set()  # used to skip gif URLs we have already downloaded
startPage = 11  # first page to crawl; adjust as needed
endPage = 20  # last page to crawl; adjust as needed. The search returns 1000 pages in total, so do not exceed 1000
pages = range(startPage, endPage + 1)
count = 0

for page in pages:
    url_page = url + '&page=' + str(page)  # URL of one specific results page
    html = urlopen(url_page)
    bsObj = BeautifulSoup(html, 'lxml')
    tds = bsObj.find_all('td', {'class': 'p_title'})  # the <td> elements that hold the post links
    for td in tds:
        td_href = td.a.get('href') if td.a else None  # .get() avoids a KeyError when href is missing
        if td_href:
            print('td_href=>', td_href)
            try:
                html_each = urlopen(td_href)
            except Exception as e:
                print(e)
                print('Request failed; moving on to the next link')
                continue
            bsObj_each = BeautifulSoup(html_each, 'lxml')
            gifs = bsObj_each.find_all('img', src=re.compile(r'.*gif'))  # find <img> tags whose src contains 'gif'
            if gifs:
                for gif in gifs:
                    gif_href = gif.attrs['src']
                    gif_href = re.match(r'.*gif', gif_href).group()  # keep everything up to '.gif', dropping trailing junk such as query strings
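                    # e.g. a hypothetical 'http://example.com/funny.gif?width=300'
                    # is trimmed down to 'http://example.com/funny.gif'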
                    if gif_href not in htmls:
                        htmls.add(gif_href)
                        print(gif_href)
                        try:
                            local_filename, headers = urlretrieve(gif_href, filename='./hupu_gif/{}.gif'.format(count))
                        except Exception as e:
                            print('Download failed =>', e)
                            continue
                        if os.path.getsize(local_filename) >= 10000:  # keep gifs of roughly 10 KB or more
                            count += 1
                        else:  # otherwise delete the file
                            os.remove(local_filename)

print('Done!')
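Once the script finishes, a quick sanity check is to count what survived the 10 KB filter. A minimal sketch using only the standard library (it assumes the same hupu_gif folder as above):

import os

kept = [f for f in os.listdir('hupu_gif') if f.endswith('.gif')]
total_kb = sum(os.path.getsize(os.path.join('hupu_gif', f)) for f in kept) / 1024
print('{} gifs kept, {:.0f} KB in total'.format(len(kept), total_kb))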