简单爬虫练习:爬取文章

2019-12-27  本文已影响0人  绛珠仙靖
#author: Jingke
from bs4 import BeautifulSoup
import ssl
from urllib.request import Request, urlopen
import urllib.request


class Scrape_news():
    """Scrape article links from a web page."""

    @staticmethod
    def _article_links(hrefs, base_url):
        """Return absolute URLs for every href that contains "articles".

        Args:
            hrefs: Iterable of raw ``href`` attribute values. ``None`` or
                empty entries (tags without an ``href``) are skipped.
            base_url: URL that relative hrefs are resolved against.

        Returns:
            A list of absolute URLs in input order; duplicates are kept.
        """
        # Imported locally so the block does not need a new file-level import.
        from urllib.parse import urljoin

        # urljoin avoids the doubled slash that plain string concatenation
        # produced ("http://host/" + "/articles/1.html") and leaves hrefs
        # that are already absolute untouched.
        return [
            urljoin(base_url, href)
            for href in hrefs
            if href and "articles" in href
        ]

    @classmethod
    def url_link(cls, url, *args, **kwargs):
        """Fetch *url* and return the article links found in the matched tags.

        Args:
            url: Address of the page to download.
            *args: Positional arguments forwarded to ``BeautifulSoup.find_all``.
            **kwargs: Keyword arguments forwarded to ``BeautifulSoup.find_all``.

        Returns:
            A list of absolute URLs, one per matched tag whose ``href``
            contains "articles". The list is also printed.

        Note:
            Errors raised by ``urlopen`` are not caught here and propagate
            to the caller.
        """
        # NOTE(security): this turns off HTTPS certificate verification for
        # the whole process, not only for this request.
        ssl._create_default_https_context = ssl._create_unverified_context

        # The opener is installed globally, so it also affects later
        # urlopen() calls made elsewhere in the process.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(req) as web:
            # Name the parser explicitly so the result does not depend on
            # which optional parser libraries happen to be installed.
            soup = BeautifulSoup(web, "html.parser")

        tags = soup.find_all(*args, **kwargs)
        links = cls._article_links((tag.get('href') for tag in tags), url)
        print(links)
        return links


if __name__ == "__main__":
    # Script entry point: print the article links found in every <a> tag of
    # the front page. To restrict the search to the headline tags instead,
    # pass "h3", {"class": "smart-dotdotdot"} in place of "a".
    front_page = 'http://www.qdaily.com'
    Scrape_news.url_link(front_page, "a")

result:
['http://www.qdaily.com//articles/64790.html', 'http://www.qdaily.com//articles/64771.html', 'http://www.qdaily.com//articles/64794.html', 'http://www.qdaily.com//articles/64764.html', 'http://www.qdaily.com//articles/64696.html', 'http://www.qdaily.com//articles/64790.html', 'http://www.qdaily.com//articles/64771.html', 'http://www.qdaily.com//articles/64794.html', 'http://www.qdaily.com//articles/64764.html', 'http://www.qdaily.com//articles/64696.html', 'http://www.qdaily.com//articles/64935.html', 'http://www.qdaily.com//articles/64924.html', 'http://www.qdaily.com//articles/64933.html', 'http://www.qdaily.com//articles/64934.html', 'http://www.qdaily.com//articles/64923.html', 'http://www.qdaily.com//articles/64921.html', 'http://www.qdaily.com//articles/64930.html', 'http://www.qdaily.com//articles/64931.html', 'http://www.qdaily.com//articles/64927.html', 'http://www.qdaily.com//articles/64922.html', 'http://www.qdaily.com//articles/64929.html', 'http://www.qdaily.com//articles/64928.html', 'http://www.qdaily.com//articles/64925.html', 'http://www.qdaily.com//articles/64926.html', 'http://www.qdaily.com//articles/64919.html', 'http://www.qdaily.com//articles/64920.html', 'http://www.qdaily.com//articles/64904.html']

------------------------------------------------------------------------------------------------------------------#

#author: Jingke

class Scrape_news():
    """Scrape the text content of matching tags from a web page."""

    @classmethod
    def url_link(cls, url, *args, **kwargs):
        """Fetch *url* and return the text of every matched tag.

        Args:
            url: Address of the page to download.
            *args: Positional arguments forwarded to ``BeautifulSoup.find_all``.
            **kwargs: Keyword arguments forwarded to ``BeautifulSoup.find_all``.

        Returns:
            A list with the ``get_text()`` result of each matched tag, in
            document order. The list is also printed.

        Note:
            Errors raised by ``urlopen`` are not caught here and propagate
            to the caller.
        """
        # NOTE(security): this turns off HTTPS certificate verification for
        # the whole process, not only for this request.
        ssl._create_default_https_context = ssl._create_unverified_context

        # The opener is installed globally, so it also affects later
        # urlopen() calls made elsewhere in the process.
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})

        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(req) as web:
            # Name the parser explicitly so the result does not depend on
            # which optional parser libraries happen to be installed.
            soup = BeautifulSoup(web, "html.parser")

        texts = [tag.get_text() for tag in soup.find_all(*args, **kwargs)]
        print(texts)
        return texts


if __name__ == "__main__":
    # Script entry point: print the text of every <h3 class="smart-dotdotdot">
    # tag on the front page.
    headline_attrs = {"class": "smart-dotdotdot"}
    Scrape_news.url_link('http://www.qdaily.com', "h3", headline_attrs)

result:
['重新认识人性的可能,如何看待 18 世纪英国平民文化?',
'两次世界大战之间的日本陆军,他们如何走向战争?',
'艾滋病如何在美国被发现,又怎样展现人性的复杂?',
'卢梭研究经典,我们该如何理解卢梭的孤独?',
'社交媒体和数字技术的发展,如何改变传统人际关系?',
'如果爱情让人自身和自身保持同一,那它可能是什么?',
'130 幅城市复原图,如何重现古地中海文明?',
'从 1931 到 1945 年,日本人的思想发生了什么转变?',
'百年以来,什么是中国文人论政的报国情怀?']

上一篇下一篇

猜你喜欢

热点阅读