Scraping Jianshu's 7-Day Trending Articles
2017-04-22
_CallMe靠谱叔
This week was hectic, but I finally got this entry-level little crawler written on Saturday.
Recommended column: 如何学习Python爬虫[入门篇]? - 知乎专栏
Worth following: 向右奔跑 - 简书
And @向右奔跑's Python crawler study group: 『Python爬虫小分队』群招募公告 - 简书
# -*- coding: utf-8 -*-
import sys
import csv
import requests
from bs4 import BeautifulSoup

# These two lines work around Python 2's default ASCII encoding
# (they do not exist on Python 3)
reload(sys)
sys.setdefaultencoding('utf-8')

base_url = 'http://www.jianshu.com/trending/weekly'
# Fetch a single page and return its HTML
def getHtml(url):
    r = requests.get(url)
    return r.text
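# If Jianshu ever rejects the bare default requests User-Agent, a hedged
# variant like this (the header value is my assumption, not from the
# original post) can be swapped in for getHtml:
def getHtmlWithUA(url):
    headers = {'User-Agent': 'Mozilla/5.0'}  # hypothetical browser-like UA
    r = requests.get(url, headers=headers)
    return r.text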
# Parse a single page; return the scraped fields for every article on it
def parseHtml(html):
    soup = BeautifulSoup(html, 'lxml')
    articles = []
    for article in soup.find_all(class_='content'):
        title = article.find(class_='title').string
        link = 'http://www.jianshu.com' + article.find(class_='title').get('href')
        author = article.find(class_='blue-link').string
        time = article.span['data-shared-at']
        meta = article.find(class_='meta').find_all(['a', 'span'])
        metas = []
        for item in meta:
            metas.append(item.get_text().strip())
        read = metas[0]
        comment = metas[1]
        like = metas[2]
        # Articles that have received no rewards carry only three meta fields
        try:
            money = metas[3]
        except IndexError:
            money = None
        articles.append([title, author, time, read, comment, like, money, link])
    return articles
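# For reference, each article card looks roughly like this simplified
# sketch, reconstructed from the selectors used above (the real markup
# may differ):
#
#   <div class="content">
#     <a class="title" href="/p/...">article title</a>
#     <a class="blue-link">author</a>
#     <span data-shared-at="...">...</span>
#     <div class="meta">
#       <a>reads</a> <a>comments</a> <span>likes</span> <span>rewards</span>
#     </div>
#   </div>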
# Write the results to CSV ('wb' mode is correct for Python 2's csv module)
def writeCSV(file, data_list):
    with open(file, 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Author', 'Time', 'Reads', 'Comments', 'Likes', 'Rewards', 'URL'])
        for data in data_list:  # data_list holds one list of rows per page
            for row in data:
                writer.writerow(row)
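# Why the setdefaultencoding hack at the top matters here: Python 2's csv
# module cannot write unicode directly, so writerow() implicitly encodes
# each field with the default codec. Left at 'ascii' that would raise
# UnicodeEncodeError on Chinese titles; switching it to 'utf-8' lets the
# implicit encoding succeed.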
if __name__ == '__main__':
    data_list = []
    # Fetch pages 1 through 6 of the weekly trending list
    for i in range(1, 7):
        url = base_url + '?page={}'.format(i)
        html = getHtml(url)
        data = parseHtml(html)
        data_list.append(data)
    writeCSV('jianshu.csv', data_list)
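If you are on Python 3, reload(sys) and sys.setdefaultencoding are gone and the csv module handles unicode natively. Here is a minimal sketch of the same crawler under that assumption; it also assumes the 2017 page structure (the content, title, blue-link, and meta class names) still holds, which may no longer be true:

# -*- coding: utf-8 -*-
import csv
import requests
from bs4 import BeautifulSoup

BASE_URL = 'http://www.jianshu.com/trending/weekly'

def parse(html):
    soup = BeautifulSoup(html, 'lxml')
    rows = []
    for card in soup.find_all(class_='content'):
        title_tag = card.find(class_='title')
        metas = [t.get_text().strip()
                 for t in card.find(class_='meta').find_all(['a', 'span'])]
        metas += [None] * (4 - len(metas))  # pad when there is no reward field
        rows.append([title_tag.string,
                     card.find(class_='blue-link').string,
                     card.span['data-shared-at'],
                     metas[0], metas[1], metas[2], metas[3],
                     'http://www.jianshu.com' + title_tag.get('href')])
    return rows

if __name__ == '__main__':
    with open('jianshu.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(['Title', 'Author', 'Time', 'Reads',
                         'Comments', 'Likes', 'Rewards', 'URL'])
        for i in range(1, 7):
            html = requests.get(BASE_URL + '?page={}'.format(i)).text
            writer.writerows(parse(html))

Either version writes jianshu.csv with one row per article across the six pages fetched.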