day10: Crawl every page of m.sohu.com and store the results in Redis
2018-08-27
是东东
Too complicated. I can't do this myself yet.
import requests
from lxml import etree
from urllib.parse import urlparse
from time import sleep
from threading import Thread
from queue import Queue
import sys
from redis import Redis
# Thread state flags: IDLE marks a thread that is waiting for work,
# WORKING marks a thread that is currently fetching and parsing a page
IDLE = 0
WORKING = 1

# Redis connection; the TODO list and the de-duplication set are kept in db 10
rds = Redis("127.0.0.1", 6379, db=10)

# Decorator that retries a function when it raises one of the given exceptions
class retry(object):
    def __init__(self, max_tries=3, wait=3, exceptions=(Exception,)):
        self.max_tries = max_tries      # number of retries after the first attempt
        self.wait = wait                # seconds to sleep between attempts
        self.exceptions = exceptions    # exception types that trigger a retry

    def __call__(self, f):
        def wrapper(*args, **kwargs):
            for i in range(self.max_tries + 1):
                try:
                    result = f(*args, **kwargs)
                except self.exceptions as e:
                    print("waiting", e)
                    sleep(self.wait)  # back off for a moment before the next attempt
                    print("retry %s" % (i + 1))
                    continue
                else:
                    return result
            # every attempt failed: fall through and return None
        return wrapper

# Set of URLs that have already been requested (in-process record)
REQUESTED_URL = set()

# Download a page; retried up to 3 times with a 3 second pause on failure
@retry(3, 3)
def fetch(url):
    print(f'Fetching: {url}')
    res = requests.get(url)
    # Record the URL as requested, both locally and in the Redis set
    # that the spider threads use for de-duplication
    REQUESTED_URL.add(url)
    rds.sadd("REQUEST_URL", url)
    # Return the page body only if the request succeeded
    if res.status_code == 200:
        return res.text
    return None

# Parse a page and return the cleaned-up list of same-site links on it
def parse(html):
    # Skip pages that failed to download or came back empty
    if html in [None, '', b'']:
        return []
    doc = etree.HTML(html)
    if doc is None:
        return []
    # Collect every <a href="..."> on the page via XPath
    urls = doc.xpath("//a/@href")
    # Clean the raw links and keep only those that belong to m.sohu.com
    url_list = []
    for ori_url in urls:
        # e.g. ParseResult(scheme='', netloc='', path='/a/249902322_102150',
        #                  params='', query='_f=m-index_business_news_9', fragment='')
        parse_url = urlparse(ori_url)
        # Relative links have an empty netloc, so default to m.sohu.com
        domain = parse_url.netloc.strip() or "m.sohu.com"
        if domain == "m.sohu.com":
            # Default to http when the scheme is missing
            scheme = parse_url.scheme.strip() or "http"
            path = parse_url.path.strip()
            query = f'?{parse_url.query}' if parse_url.query else ''
            # Rebuild an absolute URL and keep it
            url = f'{scheme}://{domain}{path}{query}'
            url_list.append(url)
    return url_list

# Download one page and push every link found on it into a queue
def get_and_parse(url, url_queue):
    html = fetch(url)
    for link in parse(html):
        url_queue.put(link)

# # Earlier version: drive the crawl with plain threads and a queue.Queue
# def process(url_list):
#     queue = Queue()
#     workers = []
#     for url in url_list:
#         t = Thread(target=get_and_parse, args=(url, queue))
#         t.setDaemon(True)
#         workers.append(t)
#         t.start()
#     for t in workers:
#         t.join()
#     return list(queue.queue)

# Multi-threaded spider: each thread pulls URLs from the Redis TODO list
class Spider(Thread):
    def __init__(self):
        super().__init__()
        self.stat = IDLE

    def is_idle(self):
        return self.stat == IDLE

    def run(self):
        while True:
            # blpop blocks until a URL is available and returns (key, value) as bytes
            url = rds.blpop("TODO_LIST")[1].decode('utf-8')
            # Start crawling this page
            self.stat = WORKING
            html = fetch(url)
            # Encode the parsed links so they compare equal to the bytes returned
            # by smembers, then drop everything that has already been requested
            url_list = set(link.encode('utf-8') for link in parse(html))
            url_list -= rds.smembers("REQUEST_URL")
            # Push the newly discovered URLs back onto the TODO list
            if url_list:
                rds.lpush("TODO_LIST", *url_list)
            # Done with this page, mark the thread as idle again
            self.stat = IDLE

def main(max_threads):
    print("Start")
    # Seed the Redis TODO list with the entry page
    print(rds.lpush("TODO_LIST", "http://m.sohu.com/"))
    # Create max_threads spider threads and start them as daemons,
    # so the process can actually exit when the main thread calls sys.exit()
    spiders = [Spider() for i in range(max_threads)]
    for spd in spiders:
        spd.daemon = True
        spd.start()
    # Watch the TODO list and the thread states to detect completion
    while True:
        if rds.llen("TODO_LIST") == 0 and all(spd.is_idle() for spd in spiders):
            # Nothing left to crawl and every thread is idle, so the crawl is done
            print("All of the work is finished")
            sys.exit(0)
        else:
            print("REQUESTED %d" % rds.scard("REQUEST_URL"))
            sleep(1)

if __name__ == '__main__':
    if len(sys.argv) >= 2:
        max_threads = int(sys.argv[1])
        main(max_threads)
    else:
        print("usage: python %s <max_threads>" % sys.argv[0])
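A quick way to try this out, assuming the script above is saved as sohu_spider.py and a local Redis server is listening on 127.0.0.1:6379 (the file name and the thread count are my own examples, not part of the original post): start the crawl with "python sohu_spider.py 8", and then use a minimal sketch like the one below to see what has accumulated in db 10 under the TODO_LIST and REQUEST_URL keys.

# Minimal inspection sketch (assumes redis-py is installed and the spider
# above has been running for a while)
from redis import Redis

rds = Redis("127.0.0.1", 6379, db=10)

# How many distinct URLs have been requested so far
print("requested:", rds.scard("REQUEST_URL"))

# How many URLs are still waiting in the TODO list
print("todo:", rds.llen("TODO_LIST"))

# Show a few of the stored URLs (smembers returns bytes)
for url in list(rds.smembers("REQUEST_URL"))[:5]:
    print(url.decode("utf-8"))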