Python 爬虫 - 爬虫-正则表达式
2019-01-03 本文已影响0人
莫名ypc
import json
import os
import re

import requests
def get_page(url, timeout=10):
    """Download a single web page and return its text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    return None
def get_resource(url, timeout=10):
    """Download a binary resource (for example an image) as raw bytes.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The undecoded response body (``bytes``) when the server answers
        with HTTP 200, otherwise ``None``.

    Raises:
        requests.exceptions.RequestException: If the connection fails or
            the timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    response = requests.get(url=url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.content
    return None
def save_pic(url):
    """Download the picture at *url* and store it under ``./images``.

    The file name is the last path segment of the URL with everything
    from the first ``@`` onwards removed.

    Args:
        url: Address of the picture to download.

    Returns:
        None. Nothing is written when the download did not succeed.
    """
    img_content = get_resource(url)
    if img_content is None:
        # get_resource returns None for a non-200 answer; writing None
        # would raise TypeError and leave an empty file behind.
        return
    file_name = url.split('/')[-1].split("@")[0]
    # Make sure the target directory exists before opening the file.
    os.makedirs('./images', exist_ok=True)
    with open('./images/%s' % file_name, 'wb') as f:
        f.write(img_content)
def get_all_pages(page_count=10):
    """Fetch and parse consecutive board pages.

    Pages are requested with ``offset`` values 0, 10, 20, ... so each
    request moves ten entries further down the board.

    Args:
        page_count: Number of pages to request; defaults to 10.

    Returns:
        One list holding the records parsed from every page that could be
        downloaded, in page order.
    """
    result_all_list = []
    for i in range(page_count):
        url = 'http://maoyan.com/board/4?offset=%d' % (i * 10)
        html = get_page(url)
        if html is None:
            # get_page returns None for a non-200 answer; skip that page
            # instead of passing None on to the regex-based parser.
            continue
        result_all_list.extend(parse_page(html))
    return result_all_list
def strips(l):
    """Return a new list with surrounding whitespace removed from each string.

    Args:
        l: Iterable of strings.

    Returns:
        A list of the stripped strings in the original order.
    """
    return [text.strip() for text in l]
def parse_score(socre_html):
    """Extract the integer and fraction parts of a score from its HTML.

    Args:
        socre_html: HTML fragment containing ``<i class="integer">`` and
            ``<i class="fraction">`` elements.

    Returns:
        A list of ``(integer, fraction)`` string tuples, one per match;
        empty when the fragment contains no such pair.
    """
    score_pattern = re.compile(
        '<i class="integer">(.*?)</i><i class="fraction">(.*?)</i>', re.S
    )
    return score_pattern.findall(socre_html)
def save_json(result_list):
    """Serialize *result_list* to ``maoyan.json`` in the working directory.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) and
    the file is encoded as UTF-8. Relies on the module-level ``json``
    import.

    Args:
        result_list: JSON-serializable data to store.
    """
    result_json_str = json.dumps(result_list, ensure_ascii=False)
    with open('maoyan.json', 'w', encoding='utf-8') as f:
        f.write(result_json_str)
def parse_page(html):
    """Extract the movie records from the HTML of one board page.

    Every field is collected with its own regular expression and the
    per-field lists are then combined by position. As a side effect each
    poster image found on the page is downloaded through ``save_pic``.

    Args:
        html: Page source as text. ``None`` or an empty string yields an
            empty result.

    Returns:
        A list of dicts with the keys ``title``, ``actor``,
        ``release_time``, ``index``, ``link``, ``img_link`` and
        ``score_list`` (the score as one string).

    Raises:
        IndexError: If any other field matched fewer times than the
            title pattern, i.e. the page markup was not as expected.
    """
    # A failed download produces None; there is nothing to parse then.
    if not html:
        return []

    pattern = re.compile('<p class="star">(.*?)</p>', re.S)
    actors = strips(re.findall(pattern, html))

    pattern = re.compile('movieId.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
    movie_names = strips(re.findall(pattern, html))

    pattern = re.compile('<p class="releasetime">(.*?)</p>', re.S)
    release_time = strips(re.findall(pattern, html))

    # Ranking position
    pattern = re.compile('<i class="board-index board-index-.*?">(.*?)</i>', re.S)
    index = strips(re.findall(pattern, html))

    # Detail-page link
    pattern = re.compile('<p class="name">.*?<a href="(.*?)"', re.S)
    link = strips(re.findall(pattern, html))

    # Poster image link; every poster is downloaded as it is found
    pattern = re.compile('movieId.*?>.*?<img.*?<img data-src="(.*?)"', re.S)
    img_link = re.findall(pattern, html)
    for pic_url in img_link:
        save_pic(pic_url)

    pattern = re.compile('<p class="score">(.*?)</p>', re.S)
    score_blocks = strips(re.findall(pattern, html))
    score_list = []
    for score_html in score_blocks:
        scores = parse_score(score_html)
        # parse_score returns (integer, fraction) tuples. Fall back to an
        # empty string when the block has no such pair so the list stays
        # aligned with the other fields instead of raising IndexError.
        score_list.append(''.join(scores[0]) if scores else '')

    # Combine the per-field lists into one record per movie
    result_list = []
    for i, title in enumerate(movie_names):
        result_list.append({
            'title': title,
            'actor': actors[i],
            'release_time': release_time[i],
            'index': index[i],
            'link': link[i],
            'img_link': img_link[i],
            'score_list': score_list[i],
        })
    return result_list
def main():
    """Collect the records from all board pages and save them as JSON."""
    save_json(get_all_pages())


if __name__ == '__main__':
    main()