Crawler Template 1
2018-08-03
TheoKm
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import re
from time import sleep
from multiprocessing import Pool
from multiprocessing import freeze_support

# Link of the parent (listing) page, without the page number
base_url = ""

# Generic request headers
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/67.0.3396.79 Safari/537.36',
}
# Build the list of parent-page URLs
def initialize_url(url):
    url_list = []
    # lent is the number of pages to crawl
    lent = 10
    for counter in range(1, lent + 1):
        url_list.append(url + str(counter) + '.html')
    return url_list
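
# Usage sketch with a hypothetical base URL (an assumption, not from the
# original template):
#   initialize_url('http://example.com/news/list_')
#   -> ['http://example.com/news/list_1.html', ...,
#       'http://example.com/news/list_10.html']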
# Generic fetch: GET the URL with requests and return the decoded HTML
def get_html(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            # Let requests guess the encoding from the page content
            response.encoding = response.apparent_encoding
            return response.text
    except requests.ConnectionError as e:
        print('Error', e.args)
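
# Usage sketch (hypothetical URL): get_html('http://example.com/list_1.html')
# returns the page text on HTTP 200, and None on any other status or on a
# connection error, so callers should be prepared for None.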
# Parse the parent page; return lists of child-page URLs, titles, and dates
def parse_father_html(html):
    if html is not None:
        soup = BeautifulSoup(html, 'html.parser')
        if soup:
            # Parse the parent page content (the empty id/attrs are blanks to fill in):
            content = soup.find(attrs={'id': ''})
            data = content.find_all(href=re.compile(""), attrs={'': ''})
            date = content.find_all(attrs={'': ''})
            # Initialize the three lists:
            url_list = []
            title_list = []
            date_list = []
            # Fill the three lists:
            for item in data:
                # Most <a> hrefs are relative, of the form "../../*****";
                # rebuild the absolute URL by substituting the leading run
                # of "../" with the site root
                item_content = re.sub(r'^(\.\./)+', 'http://*.*.com/', str(item.attrs['href']))
                url_list.append(item_content)
            for item in data:
                # Keep only the Chinese characters of the title attribute,
                # dropping everything else
                item_content = ''.join(re.findall('[\u4e00-\u9fa5]', str(item.attrs['title'])))
                title_list.append(item_content)
            for item in date:
                date_list.append(item.text)
            if url_list and title_list and date_list:
                return url_list, title_list, date_list
            else:
                print("Parent page structure changed; rewrite the parent parsing module")
        else:
            print("Failed to initialize the parent page soup")
    else:
        print("Parent page HTML does not exist")
# Parse a child page
def parse_child_html(child_html):
    if child_html is not None:
        child_soup = BeautifulSoup(child_html, 'html.parser')
        if child_soup is not None:
            # Parse the child page content. Usually grabbing the page's <p>
            # tags is enough; sometimes images are needed too, so handle
            # special cases individually (the empty attrs are blanks to fill in):
            content = child_soup.find(attrs={'': ''})
            if content is not None:
                return content
            else:
                print("Page structure changed; rewrite the child parsing module")
        else:
            print("Failed to initialize the child page soup")
    else:
        print("Child page HTML does not exist")
# Write a child page's content to disk
def out_put_html(html_content, title, time):
    # Output filename format: 【date】title.html
    with open('【' + time + '】' + title + '.html', 'w+', encoding='utf-8') as fout:
        fout.write("""<!DOCTYPE html><html lang="zh" dir="ltr"><head>
        <meta charset="utf-8"><title></title></head><body>""")
        fout.write(str(html_content))
        fout.write("</body></html>")
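
# Note: scraped titles can contain characters that are illegal in Windows
# filenames. A minimal sanitizing helper is sketched below (an addition, not
# part of the original template); it could be applied to `title` before the
# file is opened:
def sanitize_filename(name):
    # Drop the characters Windows forbids in filenames: \ / : * ? " < > |
    return re.sub(r'[\\/:*?"<>|]', '', name)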
# Main worker: crawl one parent page and all of its child pages
def main(url_item):
    # Fetch the parent page HTML
    html = get_html(url_item)
    # Parse the parent page into child URL, title, and date lists
    result = parse_father_html(html)
    if result is None:
        return
    urls, titles, dates = result
    # Number of child pages
    lent = len(dates)
    # Fetch, parse, and save each child page in turn
    for counter in range(0, lent):
        # Fetch the child page HTML
        children_html = get_html(urls[counter])
        # Parse out the child page content
        children_content = parse_child_html(children_html)
        # Save the child page locally
        out_put_html(str(children_content), str(titles[counter]), str(dates[counter]))
        # Sleep one second after each page
        sleep(1)
# Entry point
def run():
    # freeze_support prevents endless process re-spawning (and the resulting
    # memory blow-up) when a pyinstaller-packaged build runs on Windows
    freeze_support()
    # Build the parent page URL list
    url_list = initialize_url(base_url)
    # Process pool: crawl parent pages in parallel to speed things up
    pool = Pool(10)
    pool.map(main, url_list)
    pool.close()
    pool.join()

if __name__ == '__main__':
    run()