Statistics on Active Python Content-Sharing Sites, Plus a Roundup of Article Links
2017-07-29
treelake
By doing a simple tally of where the articles and tutorials recommended in each issue of PythonWeekly are hosted, we can infer which sites abroad are the most active for sharing Python content, and roughly compare how the leading Python blogs and publishing venues have shifted.
"Old" covers roughly 2012-2013; "new" covers this year (2017). Format: [(site root domain, cumulative count), ...].
(The two samples cover different numbers of issues: "new" is the most recent 20 issues, "old" the first 81. Only links to articles and tutorials are counted.)
As the numbers show, GitHub and YouTube have remained active sharing sites throughout. WordPress's share has shrunk, Blogspot has all but disappeared, and Medium is now riding high. The landscape has changed quite a bit.
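The tally itself is produced the same way in both scripts below: pull the registered domain out of every article link with tldextract and count occurrences with collections.Counter. A minimal sketch, with a made-up list of article URLs standing in for the scraped data:

import tldextract
from collections import Counter

# Illustrative input only; the real scripts collect these links from each issue.
article_urls = [
    'https://github.com/someuser/someproject',
    'https://medium.com/@someone/some-post',
    'https://www.youtube.com/watch?v=XXXXXXXXXXX',
]

# Keep only the registered domain: 'github', 'medium', 'youtube', ...
domains = [tldextract.extract(u).domain for u in article_urls]

# [(site root domain, cumulative count), ...]
print(Counter(domains).most_common(20))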
- PythonWeekly's content is quite good, but I sometimes get lazy, skip it, and miss quite a few issues, and digging through the mailing-list archive one message at a time is a chore. So I wrote a crawler that scrapes only the recommended articles/tutorials section of each issue and produces a markdown list that can be skimmed quickly, which makes it easy to pick out interesting articles and is also convenient to search when opened in a browser. Scanning English titles is still slow, though, so I registered a Baidu Translate API key and machine-translated the titles; it is not as good as Google Translate, but it is readable enough, and Chinese is simply more comfortable to skim. Click the links below to view the compiled lists. (Jianshu could not host them, so they live on GitHub; the rendering is about the same.)
Roundup of recent articles
Roundup of early articles
- Only the first 81 issues (handled by initNews.py) and the most recent 20 issues (handled by recentNews.py) are currently available in the archives (PythonWeekly ships one issue per week). To run the code, substitute your own Baidu Translate API key.
- initNews.py and recentNews.py are largely the same. The latter is single-threaded and slower, but with so few issues it still finishes quickly. The former uses a thread pool, which gives a very noticeable speedup: despite four times the volume, it completes almost in one go; a minimal sketch of this thread-pool pattern follows this list. (Also, the Baidu Translate API can translate several sentences in one request when the query joins them with '\n'.)
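For reference, here is a minimal sketch of the thread-pool pattern that initNews.py relies on (multiprocessing.dummy.Pool maps a function over a list using threads, which suits these I/O-bound HTTP requests; the fetch helper and the URL list below are made up purely for illustration):

import requests
from multiprocessing.dummy import Pool  # thread pool, same API as multiprocessing.Pool

# Illustrative URLs; the real script scrapes the archive index to build this list.
urls = [
    'http://www.pythonweekly.com/archive/1.html',
    'http://www.pythonweekly.com/archive/2.html',
]

def fetch(url):
    # Each call blocks on network I/O, so the worker threads overlap nicely.
    return requests.get(url).status_code

pool = Pool(30)               # 30 worker threads, matching the script below
print(pool.map(fetch, urls))  # e.g. [200, 200]
pool.close()
pool.join()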
Code
initNews.py
import requests
from bs4 import BeautifulSoup
import re
# Replace with your own Baidu Translate API credentials
appid = 'yourappid'
secretKey = 'yoursecretkey'
from fake_useragent import UserAgent
ua = UserAgent()
headers = {'user-agent': ua.chrome}
pythonweekly_init_issues_archive_url = (
    'http://www.pythonweekly.com/archive/')

def get_pythonweekly_init_issues_urls():
    url = pythonweekly_init_issues_archive_url
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'lxml')
    return [[
        a.text.split(' ')[-1].strip(),
        ''.join([url, a['href']]),
    ] for a in soup.select('li a')]
pythonweekly_init_issues_urls = get_pythonweekly_init_issues_urls()
def get_single_issue_info(issue):
    try:
        # issue = [text, url]; this function appends a list of [title, link] pairs
        url = issue[1]
        res = requests.get(url, headers=headers)
        soup = BeautifulSoup(res.content, 'lxml')
        content = soup.select_one('td .defaultText')
        try:
            # Locate the "Articles, Tutorials and Talks" section heading
            submenus = [i.text for i in content.find_all('strong')]
            for index, menu in enumerate(submenus):
                if re.search('[Aa]rticles', menu):
                    break
            start_text = [menu, ]
            end_text = submenus[index + 1]
        except:
            # Dirty fallback for issues whose headings are not <strong> tags
            start_text = ['Articles,\xa0Tutorials and Talks',
                          '\xa0Tutorials and Talks',  # handles 11.html and 12.html
                          'Articles Tutorials and Talks']
            end_text = 'Interesting Projects, Tools and Libraries'
        flag = 0
        list_ = []
        for s in content.find_all('span'):
            if not flag:
                if s.text not in start_text:
                    continue
                else:
                    flag = 1
                    continue
            if s.text == end_text:
                break
            try:
                one = [s.text.strip(), s.find('a')['href']]
                # print(one)
                list_.append(one)
            except TypeError:
                pass
        # return list_
        issue.append(list_)
        print('downloaded', issue[0])
    except Exception as e:
        print('wrong: ', issue[0], '\n', e)
from multiprocessing.dummy import Pool
pool = Pool(30)
pool.map(get_single_issue_info, pythonweekly_init_issues_urls)
pythonweekly_init_issues = pythonweekly_init_issues_urls
def baidu_translates(query):
    '''
    http://api.fanyi.baidu.com/api/trans/product/apidoc
    '''
    from hashlib import md5
    import random
    url = 'http://api.fanyi.baidu.com/api/trans/vip/translate'
    fromLang = 'en'
    toLang = 'zh'
    salt = random.randint(32768, 65536)
    sign = appid + query + str(salt) + secretKey
    m1 = md5()
    m1.update(sign.encode('utf-8'))
    sign = m1.hexdigest()
    params = {'appid': appid,
              'q': query,
              'from': fromLang,
              'to': toLang,
              'salt': str(salt),
              'sign': sign, }
    res = requests.get(url, params=params)
    return res.json()['trans_result']
def get_translate(issue):
    articles = issue[-1]
    try:
        # Batch call: titles joined with '\n' are translated in a single request
        result = baidu_translates('\n'.join([i[0] for i in articles]))
        for index, i in enumerate(articles):
            i.append(result[index]['dst'])
        print('translated', issue[0])
    except:
        print('**translation failed**', issue[0])

pool.map(get_translate, pythonweekly_init_issues)
from jinja2 import Template
table = """
<table>
{% for issue_num, issue_href, article_lists in issues %}
{% for article_name, article_href, article_chinese in article_lists %}
<tr>
<td><a href='{{issue_href}}'>{{ issue_num }}</a></td>
<td><a href='{{article_href}}'>{{ article_name }}</a></td>
<td><a href='{{article_href}}'>{{ article_chinese }}</a></td>
</tr>
{% endfor %}
{% endfor %}
</table>
"""
template = Template(table)
t = template.render(issues=pythonweekly_init_issues)
import time
with open('pythonweekly_init ' + time.ctime().replace(':', '_') + '.html', 'w', encoding='utf-8') as f:
    f.write(t)
pool.close()
pool.join()
# https://stackoverflow.com/questions/9626535/get-domain-name-from-url
# get_host = requests.urllib3.util.url.get_host # get_host(i[1])[1]
import tldextract
host_list = [
    tldextract.extract(i[1]).domain
    for *_, articles in pythonweekly_init_issues for i in articles]
from collections import Counter
counter = Counter(host_list)
print(counter.most_common(20))
with open('pythonweekly_init.md', 'w', encoding='utf-8') as f:
    # Column headers are intentionally Chinese: issue number | English title | Chinese title
    f.write(u'### PythonWeekly初期文章教程汇总\n')
    f.write(u'| 期号 | 英文名 | 中文名|\n')
    f.write(u'| ------------- |:-------------:| -----:|\n')
    for issue_num, issue_href, article_lists in pythonweekly_init_issues:
        for article_name, article_href, article_chinese in article_lists:
            f.write(('| [{issue_num}]({issue_href}) '
                     '| [{article_name}]({article_href}) '
                     '| [{article_chinese}]({article_href}) '
                     '| \n').format(**locals()))
recentNews.py
import requests
from bs4 import BeautifulSoup
import re
# Replace with your own Baidu Translate API credentials
appid = 'yourappid'
secretKey = 'yoursecretkey'
from fake_useragent import UserAgent
ua = UserAgent()
headers = {'user-agent': ua.chrome}
pythonweekly_recent_issues_archive_url = (
    'http://us2.campaign-archive2.com/home/'
    '?u=e2e180baf855ac797ef407fc7&id=9e26887fc5')

def get_pythonweekly_recent_issues_urls():
    res = requests.get(pythonweekly_recent_issues_archive_url, headers=headers)
    soup = BeautifulSoup(res.content, 'lxml')
    return [[
        a.text.split(' ')[-1].strip(),
        a['href'],
    ] for a in soup.select('li a')]
pythonweekly_recent_issues_urls = get_pythonweekly_recent_issues_urls()
def get_single_issue_info(url):
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content, 'lxml')
    content = soup.select_one('td .defaultText')
    submenus = [i.text for i in content.find_all('span', attrs={'style': "color:#B22222"})]
    for index, i in enumerate(submenus):
        if re.search('[Aa]rticles', i):
            break
    start_text = i
    end_text = submenus[index + 1]
    flag = 0
    list_ = []
    for s in content.find_all('span'):
        if not flag:
            if s.text != start_text:
                continue
            else:
                flag = 1
                continue
        if s.text == end_text:
            break
        try:
            one = [s.text.strip(), s.find('a')['href']]
            # print(one)
            list_.append(one)
        except TypeError:
            pass
    return list_

for i in pythonweekly_recent_issues_urls:
    # i = [text, url, list]
    print(i[0])
    i.append(get_single_issue_info(i[1]))
pythonweekly_recent_issues = pythonweekly_recent_issues_urls
def baidu_translate(query):
    '''
    http://api.fanyi.baidu.com/api/trans/product/apidoc
    '''
    from hashlib import md5
    import random
    url = 'http://api.fanyi.baidu.com/api/trans/vip/translate'
    fromLang = 'en'
    toLang = 'zh'
    salt = random.randint(32768, 65536)
    sign = appid + query + str(salt) + secretKey
    m1 = md5()
    m1.update(sign.encode('utf-8'))
    sign = m1.hexdigest()
    params = {'appid': appid,
              'q': query,
              'from': fromLang,
              'to': toLang,
              'salt': str(salt),
              'sign': sign, }
    res = requests.get(url, params=params)
    return res.json()['trans_result'][0]['dst']
for *_, articles in pythonweekly_recent_issues:
    for i in articles:
        i.append(baidu_translate(i[0]))
    print('done')
from jinja2 import Template
table = """
<table>
{% for issue_num, issue_href, article_lists in issues %}
{% for article_name, article_href, article_chinese in article_lists %}
<tr>
<td><a href='{{issue_href}}'>{{ issue_num }}</a></td>
<td><a href='{{article_href}}'>{{ article_name }}</a></td>
<td><a href='{{article_href}}'>{{ article_chinese }}</a></td>
</tr>
{% endfor %}
{% endfor %}
</table>
"""
template = Template(table)
t = template.render(issues=pythonweekly_recent_issues)
import time
with open('pythonweekly_recent ' + time.ctime().replace(':', '_') + '.html', 'w', encoding='utf-8') as f:
    f.write(t)
import tldextract
host_list = [
    tldextract.extract(i[1]).domain
    for *_, articles in pythonweekly_recent_issues for i in articles]
from collections import Counter
counter = Counter(host_list)
print(counter.most_common(20))
with open('pythonweekly_recent.md', 'w', encoding='utf-8') as f:
    f.write(u'### PythonWeekly文章教程近期汇总\n')
    f.write(u'| 期号 | 英文名 | 中文名|\n')
    f.write(u'| ------------- |:-------------:| -----:|\n')
    for issue_num, issue_href, article_lists in pythonweekly_recent_issues:
        for article_name, article_href, article_chinese in article_lists:
            f.write(('| [{issue_num}]({issue_href}) '
                     '| [{article_name}]({article_href}) '
                     '| [{article_chinese}]({article_href}) '
                     '| \n').format(**locals()))