Web Crawler – Multi-threaded Download of Internet Meme Images
2018-10-17
真夜猫
In the internet age, online social networking has become an indispensable part of everyday life, and meme images have become a new language of communication. Want to hold your own in a meme battle? Below is the code for downloading memes from doutula.com with multiple threads, so you always have the upper hand.
import requests
import threading
from lxml import etree
from urllib import request
from copyheaders import headers_raw_to_dict
import os
import re
from queue import Queue
These are the libraries the crawler needs to import.
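The only unusual helper here besides requests and lxml is copyheaders: its headers_raw_to_dict turns a header block copied straight out of the browser's developer tools (as a bytes literal) into a dict that can be handed to requests. A minimal sketch, with made-up header values:

from copyheaders import headers_raw_to_dict

# "Name:value" lines pasted straight from the browser's developer tools,
# stored as a bytes literal
raw_headers = b'''
Host:www.doutula.com
User-Agent:Mozilla/5.0
'''

# returns a dict mapping header names to values, ready to be passed as
# requests.get(url, headers=headers)
headers = headers_raw_to_dict(raw_headers)
print(headers)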
class Producer(threading.Thread):
    headers = b'''
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Encoding:gzip, deflate, sdch
Accept-Language:zh-CN,zh;q=0.8
Cache-Control:max-age=0
Connection:keep-alive
Cookie:td_cookie=18446744071663087930; __cfduid=da1e314d1be1ba8cb3058750c9b4070ad1526538626; UM_distinctid=1636ccae4fc197-07fc2be69e5f26-7b177971-1fa400-1636ccae4fe33e; yjs_id=b9097d3365aece9094783e91887b271b; ctrl_time=1; XSRF-TOKEN=eyJpdiI6IjFOcGxNVFNNa0pUUmtiZGtxWG16QUE9PSIsInZhbHVlIjoibkNhN0NjSlhZZlFRVGdtd0k3ZTAreENEck1TcERnVVc1S2NiK01kVDRvNkI2SmVCRkFLZE1FUUlRZEVNd2FqZkdWRE9JUFpTeUs4eWJ5YmxcL1BzSzJRPT0iLCJtYWMiOiJlZDY1NmRmYjE3MDZmMDFiNTkxNGFhZTMyNzU4YzdlOGU5ZWQyYWIxNjZmMzBmM2U0ZThhNGFkYzEzZTZlYjE0In0%3D; laravel_session=eyJpdiI6ImZrWTZIRTVOTFBEeWJ3a2tCeDVmc0E9PSIsInZhbHVlIjoiMnlrTDVJbklCQ1Vkc052ZU05XC9Ga2hwYlVmRHhRVXZFaFdwTzRiMXdLV1E3TTcyYzUxbFF1dFNFK0IrNVhqMmVaK0RCVlg0ang3Y1JEcUkzeDZkMWVBPT0iLCJtYWMiOiIwZGRlODRlYTEzODk5YzQ4NzA2N2VkMTY1NzI1MjIyNzE0NTM5ZDA3MDc5NjZkNGM2Y2Y3MDE5ZDBjODFhZWVlIn0%3D; _ga=GA1.2.206995134.1526538626; _gid=GA1.2.7468436.1526538626; CNZZDATA1256911977=1292825661-1526535471-http%253A%252F%252Fdoutula.com%252F%7C1526535471
Host:www.doutula.com
Referer:http://www.doutula.com/photo/list/?page=2
Upgrade-Insecure-Requests:1
User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12814.400
'''
    headers = headers_raw_to_dict(headers)

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        # *args captures any extra positional arguments and **kwargs any keyword
        # arguments, so they can be forwarded unchanged to threading.Thread
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        response = requests.get(url, headers=self.headers)
        text = response.text
        html = etree.HTML(text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            # print(etree.tostring(img, encoding="utf8").decode("utf8"))
            img_url = img.get("data-original")
            alt = img.get("alt")
            # strip punctuation that is not safe in a file name
            alt = re.sub(r'[\??\.。,!!]', '', alt)
            suffix = os.path.splitext(img_url)[1]
            print(img_url, alt, suffix)
            filename = alt + suffix
            self.img_queue.put((img_url, filename))
The producer extracts each image's URL and name from the page and puts them into the img_queue queue.
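To see what parse_page is doing, here is a self-contained sketch that runs the same XPath against a tiny made-up snippet mimicking the list-page markup (the real doutula.com markup may differ): every img under the content div is taken except the class="gif" loading placeholders, and its data-original URL and alt text are read.

from lxml import etree

# made-up snippet that mimics the list-page structure the XPath expects
sample = '''
<div class="page-content text-center">
  <img class="img-responsive lazy" data-original="http://img.doutula.com/a.jpg" alt="好气哦">
  <img class="gif" src="loading.gif">
</div>
'''

html = etree.HTML(sample)
# same XPath as parse_page: skip the class="gif" placeholders
for img in html.xpath("//div[@class='page-content text-center']//img[@class!='gif']"):
    print(img.get("data-original"), img.get("alt"))
# prints: http://img.doutula.com/a.jpg 好气哦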
class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                break
            img_url, filename = self.img_queue.get()
            # save into the 斗图网 folder on the Desktop; the folder must already exist
            os.chdir(r"C:\Users\****\Desktop")
            request.urlretrieve(img_url, '斗图网/' + filename)
            print(filename + "下载完成")
The consumer takes each image's URL and file name out of img_queue, downloads the image, and saves it to disk.
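One thing to watch: urlretrieve writes into 斗图网/ relative to the Desktop, so that folder has to exist beforehand or the call raises FileNotFoundError. A small alternative sketch (the save_image name and folder argument are only illustrative, not part of the original code) that creates the folder itself and downloads with requests instead of urllib:

import os
import requests

def save_image(img_url, filename, folder="斗图网"):
    # create the target folder if it is not there yet
    os.makedirs(folder, exist_ok=True)
    resp = requests.get(img_url, timeout=10)
    resp.raise_for_status()
    # write the raw image bytes to disk
    with open(os.path.join(folder, filename), "wb") as f:
        f.write(resp.content)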
def main():
    page_queue = Queue(100)
    img_queue = Queue(1000)
    for x in range(1, 101):
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        page_queue.put(url)
    for x in range(6):
        t = Producer(page_queue, img_queue)
        t.start()
    for x in range(6):
        t = Consumer(page_queue, img_queue)
        t.start()

if __name__ == "__main__":
    main()
The main function is responsible for creating the queues and controlling the number of threads. The Producer and Consumer classes above both inherit from threading.Thread, which is what lets the downloads run on multiple threads at once.
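Note that the shutdown logic relies on polling empty(): a consumer that passes the check just as the last item is taken by another thread will block forever inside get(). A common alternative, shown here only as a sketch and not part of the original code, is to push one None sentinel per consumer once all the work has been queued:

from queue import Queue
import threading

def consumer(img_queue):
    while True:
        item = img_queue.get()
        if item is None:      # sentinel: no more work is coming
            break
        img_url, filename = item
        print("would download", img_url, "as", filename)

q = Queue()
threads = [threading.Thread(target=consumer, args=(q,)) for _ in range(2)]
for t in threads:
    t.start()
q.put(("http://img.doutula.com/a.jpg", "a.jpg"))
for _ in threads:
    q.put(None)               # one sentinel per consumer thread
for t in threads:
    t.join()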
The complete code is as follows:
import requests
import threading
from lxml import etree
from urllib import request
from copyheaders import headers_raw_to_dict
import os
import re
from queue import Queue
class Producer(threading.Thread):
    headers = b'''
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Encoding:gzip, deflate, sdch
Accept-Language:zh-CN,zh;q=0.8
Cache-Control:max-age=0
Connection:keep-alive
Cookie:td_cookie=18446744071663087930; __cfduid=da1e314d1be1ba8cb3058750c9b4070ad1526538626; UM_distinctid=1636ccae4fc197-07fc2be69e5f26-7b177971-1fa400-1636ccae4fe33e; yjs_id=b9097d3365aece9094783e91887b271b; ctrl_time=1; XSRF-TOKEN=eyJpdiI6IjFOcGxNVFNNa0pUUmtiZGtxWG16QUE9PSIsInZhbHVlIjoibkNhN0NjSlhZZlFRVGdtd0k3ZTAreENEck1TcERnVVc1S2NiK01kVDRvNkI2SmVCRkFLZE1FUUlRZEVNd2FqZkdWRE9JUFpTeUs4eWJ5YmxcL1BzSzJRPT0iLCJtYWMiOiJlZDY1NmRmYjE3MDZmMDFiNTkxNGFhZTMyNzU4YzdlOGU5ZWQyYWIxNjZmMzBmM2U0ZThhNGFkYzEzZTZlYjE0In0%3D; laravel_session=eyJpdiI6ImZrWTZIRTVOTFBEeWJ3a2tCeDVmc0E9PSIsInZhbHVlIjoiMnlrTDVJbklCQ1Vkc052ZU05XC9Ga2hwYlVmRHhRVXZFaFdwTzRiMXdLV1E3TTcyYzUxbFF1dFNFK0IrNVhqMmVaK0RCVlg0ang3Y1JEcUkzeDZkMWVBPT0iLCJtYWMiOiIwZGRlODRlYTEzODk5YzQ4NzA2N2VkMTY1NzI1MjIyNzE0NTM5ZDA3MDc5NjZkNGM2Y2Y3MDE5ZDBjODFhZWVlIn0%3D; _ga=GA1.2.206995134.1526538626; _gid=GA1.2.7468436.1526538626; CNZZDATA1256911977=1292825661-1526535471-http%253A%252F%252Fdoutula.com%252F%7C1526535471
Host:www.doutula.com
Referer:http://www.doutula.com/photo/list/?page=2
Upgrade-Insecure-Requests:1
User-Agent:Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.3538.400 QQBrowser/9.6.12814.400
'''
    headers = headers_raw_to_dict(headers)

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        # *args captures any extra positional arguments and **kwargs any keyword
        # arguments, so they can be forwarded unchanged to threading.Thread
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            self.parse_page(url)

    def parse_page(self, url):
        response = requests.get(url, headers=self.headers)
        text = response.text
        html = etree.HTML(text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            # print(etree.tostring(img, encoding="utf8").decode("utf8"))
            img_url = img.get("data-original")
            alt = img.get("alt")
            # strip punctuation that is not safe in a file name
            alt = re.sub(r'[\??\.。,!!]', '', alt)
            suffix = os.path.splitext(img_url)[1]
            print(img_url, alt, suffix)
            filename = alt + suffix
            self.img_queue.put((img_url, filename))
class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                break
            img_url, filename = self.img_queue.get()
            # save into the 斗图网 folder on the Desktop; the folder must already exist
            os.chdir(r"C:\Users\****\Desktop")
            request.urlretrieve(img_url, '斗图网/' + filename)
            print(filename + "下载完成")
def main():
    page_queue = Queue(100)
    img_queue = Queue(1000)
    for x in range(1, 101):
        url = "http://www.doutula.com/photo/list/?page=%d" % x
        page_queue.put(url)
    for x in range(6):
        t = Producer(page_queue, img_queue)
        t.start()
    for x in range(6):
        t = Consumer(page_queue, img_queue)
        t.start()

if __name__ == "__main__":
    main()