爬图片
2019-08-06 本文已影响0人
Vincy_ivy
根据 txt 文件(starName.txt)中的明星名字,用 Bing 爬取图片
import os
from icrawler.builtin import BingImageCrawler
# Crawl Bing images for the star names listed in starName.txt (one per line),
# storing each star's images in its own sub-directory of `path`.
path = r'D:\pycharm_1\Image'

# Read every name up front; the context manager guarantees the file handle
# is closed even if reading fails.
with open('starName.txt', 'r') as f:
    lines = f.readlines()

for i, line in enumerate(lines):
    # Only entries with index 0..25 are processed.
    if i > 25:
        break
    # The entry at index 1 is skipped, exactly as in the original script.
    # NOTE(review): the reason is not visible here - confirm it is still wanted.
    if i == 1:
        continue
    name = line.strip('\n')
    # A blank line would otherwise start a crawl with an empty keyword.
    if not name:
        continue
    file_path = os.path.join(path, name)
    # The directory is not created here; presumably icrawler creates
    # `root_dir` on demand - TODO confirm.
    bing_storage = {'root_dir': file_path}
    bing_crawler = BingImageCrawler(parser_threads=2, downloader_threads=4, storage=bing_storage)
    bing_crawler.crawl(keyword=name, max_num=50)
    print('第{}位明星:{}'.format(i, name))
进入网站(win4000.com),根据输入的明星名字爬取图片
import requests
import re
import os
from pypinyin import pinyin, lazy_pinyin
def getHTMLText(url):
    """Fetch *url* and return the decoded response body.

    The response encoding is set from ``apparent_encoding`` before the text
    is read.

    Returns:
        The page text, or ``None`` when the request fails (connection error,
        timeout after 30 seconds, invalid URL, or a non-2xx status).
    """
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        # Report the failure instead of swallowing it silently; callers
        # detect the failure through the None return value.
        print("请求失败: {} ({})".format(url, e))
        return None
def getPageUrls(text, name):
    """Find the linked images in *text* whose ``alt`` text starts with *name*.

    Args:
        text: HTML source to search.
        name: Literal prefix the image's ``alt`` attribute must start with.

    Returns:
        A list of ``(href, img_src)`` tuples, one per match, in document
        order. Empty when nothing matches.
    """
    # Escape the name so characters such as "(" or "." are matched literally
    # instead of being interpreted as regex syntax.
    re_pageUrl = r'href="(.+)">\s*<img src="(.+)" alt="' + re.escape(name)
    return re.findall(re_pageUrl, text)
def downPictures(text, root, name):
    """Download every album listed on one listing page.

    For each album found in *text*, a directory ``root + title + "//"`` is
    created. Albums whose directory already contains files are skipped; for
    the others the album's picture pages are walked via the "下一张" link and
    each picture is saved as ``1.jpg``, ``2.jpg``, ...

    Args:
        text: HTML of the listing page.
        root: Directory prefix (ending in a separator) under which album
            directories are created.
        name: Star name; used as the literal prefix of each album's ``alt``
            text.

    Returns:
        None.
    """
    pageUrls = getPageUrls(text, name)
    # Escape the name so it is matched literally inside the pattern.
    titles = re.findall(r'alt="' + re.escape(name) + r'(.+)" ', text)
    # zip() stops at the shorter list, so a mismatch between the two regex
    # results can no longer raise IndexError.
    for (pageUrl, _coverUrl), title in zip(pageUrls, titles):
        path = root + title + "//"
        os.makedirs(path, exist_ok=True)
        # A non-empty directory is treated as "already downloaded".
        if os.listdir(path):
            continue
        pageText = getHTMLText(pageUrl)
        if pageText is None:
            continue
        # The closing parenthesis must be escaped; unescaped it makes the
        # pattern invalid. Presumably the page shows the total as
        # "<em>N</em>)" - verify against the live markup.
        totals = re.findall(r'<em>(\d+)</em>\)', pageText)
        if not totals:
            continue
        totalPics = int(totals[0])
        for cnt in range(1, totalPics + 1):
            downUrls = re.findall(r'href="(.+?)" class="">下载图片', pageText)
            if not downUrls:
                break
            picPath = path + str(cnt) + ".jpg"
            # Same timeout as getHTMLText so a stalled download cannot hang
            # the script forever.
            r = requests.get(downUrls[0], timeout=30)
            with open(picPath, 'wb') as f:
                f.write(r.content)
            print('{} - 第{}张下载已完成\n'.format(title, cnt))
            # Nothing follows the last picture, so do not fetch another page.
            if cnt == totalPics:
                break
            nextPageUrls = re.findall(r'href="(.+?)">下一张', pageText)
            if not nextPageUrls:
                break
            pageText = getHTMLText(nextPageUrls[0])
            if pageText is None:
                break
    return
def main():
    """Ask for a star's name and download all of that star's albums.

    The name is converted to pinyin to build the listing URL on win4000.com.
    Albums are saved under ``D://pycharm//<name>//``. Listing pages are
    followed through their ``next`` link until no further page is found.
    """
    name = input("请输入你喜欢的明星的名字:")
    nameUrl = "http://www.win4000.com/mt/" + ''.join(lazy_pinyin(name)) + ".html"

    text = getHTMLText(nameUrl)
    # getHTMLText returns None when the page could not be fetched; the page
    # itself contains "暂无...!" when the site has no albums for this name.
    if text is None or re.findall(r'暂无(.+)!', text):
        print("不好意思,没有{}的照片".format(name))
        return

    root = "D://pycharm//" + name + "//"
    # makedirs also creates the parent directory when it is missing.
    os.makedirs(root, exist_ok=True)

    downPictures(text, root, name)
    nextPages = re.findall(r'next" href="(.+)"', text)
    while nextPages:
        text = getHTMLText(nextPages[0])
        if text is None:
            # A later listing page failed to load; stop instead of claiming
            # that no photos exist.
            print("获取下一页失败,已停止下载")
            return
        downPictures(text, root, name)
        nextPages = re.findall(r'next" href="(.+)"', text)
    print("已全部下载完毕")
    return


# Run the downloader only when executed as a script, not when imported.
if __name__ == '__main__':
    main()