简单爬虫练习:爬取文章
#author: Jingke
from bs4 import BeautifulSoup
import ssl
from urllib.request import Request, urlopen
import urllib.request
class Scrape_news():
    """Collect article links from a downloaded HTML page."""

    @classmethod
    def url_link(cls, url, *args, **kwargs):
        """Download *url* and return the article links found on the page.

        Args:
            url: Address of the page to fetch. Relative links are resolved
                against it.
            *args: Positional arguments forwarded unchanged to
                ``BeautifulSoup.find_all`` to select the tags to inspect.
            **kwargs: Keyword arguments forwarded unchanged to
                ``BeautifulSoup.find_all``.

        Returns:
            A list of absolute URLs, in document order, for every selected
            tag whose ``href`` contains ``"articles"``. Duplicates are kept.
            The list is also printed to stdout.
        """
        # Local import so this block does not depend on the file's import list.
        from urllib.parse import urljoin

        # NOTE(review): this turns off TLS certificate verification for the
        # whole process, not just this request. Kept because callers may rely
        # on it, but it is unsafe outside of a throwaway exercise.
        ssl._create_default_https_context = ssl._create_unverified_context

        # Installs a process-wide default opener with a browser-like
        # User-Agent; later urlopen() calls anywhere in the process use it too.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)

        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(req) as web:
            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            bsObj = BeautifulSoup(web, "html.parser")

        links = []
        for tag in bsObj.find_all(*args, **kwargs):
            href = tag.get('href')
            # Tags without an href give None; skip them instead of crashing
            # on the substring test.
            if href and "articles" in href:
                # urljoin avoids the doubled slash that plain concatenation
                # produced for root-relative hrefs such as "/articles/1.html".
                links.append(urljoin(url, href))

        print(links)
        return links
if __name__ == '__main__':
    # Demo run: collect article links from every <a> tag on the front page.
    # To target headline elements instead, pass "h3" and
    # {"class": "smart-dotdotdot"} as the find_all arguments.
    Scrape_news.url_link('http://www.qdaily.com', "a")
result:
['http://www.qdaily.com//articles/64790.html', 'http://www.qdaily.com//articles/64771.html', 'http://www.qdaily.com//articles/64794.html', 'http://www.qdaily.com//articles/64764.html', 'http://www.qdaily.com//articles/64696.html', 'http://www.qdaily.com//articles/64790.html', 'http://www.qdaily.com//articles/64771.html', 'http://www.qdaily.com//articles/64794.html', 'http://www.qdaily.com//articles/64764.html', 'http://www.qdaily.com//articles/64696.html', 'http://www.qdaily.com//articles/64935.html', 'http://www.qdaily.com//articles/64924.html', 'http://www.qdaily.com//articles/64933.html', 'http://www.qdaily.com//articles/64934.html', 'http://www.qdaily.com//articles/64923.html', 'http://www.qdaily.com//articles/64921.html', 'http://www.qdaily.com//articles/64930.html', 'http://www.qdaily.com//articles/64931.html', 'http://www.qdaily.com//articles/64927.html', 'http://www.qdaily.com//articles/64922.html', 'http://www.qdaily.com//articles/64929.html', 'http://www.qdaily.com//articles/64928.html', 'http://www.qdaily.com//articles/64925.html', 'http://www.qdaily.com//articles/64926.html', 'http://www.qdaily.com//articles/64919.html', 'http://www.qdaily.com//articles/64920.html', 'http://www.qdaily.com//articles/64904.html']
------------------------------------------------------------------------------------------------------------------#
#author: Jingke
class Scrape_news():
    """Collect the text of selected elements from a downloaded HTML page."""

    @classmethod
    def url_link(cls, url, *args, **kwargs):
        """Download *url* and return the text of every matching element.

        Args:
            url: Address of the page to fetch.
            *args: Positional arguments forwarded unchanged to
                ``BeautifulSoup.find_all`` to select the elements.
            **kwargs: Keyword arguments forwarded unchanged to
                ``BeautifulSoup.find_all``.

        Returns:
            A list with the ``get_text()`` result of each matched element,
            in document order. The list is also printed to stdout.
        """
        # NOTE(review): this turns off TLS certificate verification for the
        # whole process, not just this request. Kept because callers may rely
        # on it, but it is unsafe outside of a throwaway exercise.
        ssl._create_default_https_context = ssl._create_unverified_context

        # Installs a process-wide default opener with a browser-like
        # User-Agent; later urlopen() calls anywhere in the process use it too.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)

        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(req) as web:
            # Name the parser explicitly so results do not depend on which
            # optional parsers happen to be installed.
            bsObj = BeautifulSoup(web, "html.parser")

        titles = [node.get_text() for node in bsObj.find_all(*args, **kwargs)]
        print(titles)
        return titles
if __name__ == '__main__':
    # Demo run: print the text of the front page's
    # <h3 class="smart-dotdotdot"> headline elements.
    Scrape_news.url_link('http://www.qdaily.com', "h3", {"class": "smart-dotdotdot"})
result:
['重新认识人性的可能,如何看待 18 世纪英国平民文化?',
'两次世界大战之间的日本陆军,他们如何走向战争?',
'艾滋病如何在美国被发现,又怎样展现人性的复杂?',
'卢梭研究经典,我们该如何理解卢梭的孤独?',
'社交媒体和数字技术的发展,如何改变传统人际关系?',
'如果爱情让人自身和自身保持同一,那它可能是什么?',
'130 幅城市复原图,如何重现古地中海文明?',
'从 1931 到 1945 年,日本人的思想发生了什么转变?',
'百年以来,什么是中国文人论政的报国情怀?']