
Python: Requests/Aiohttp -- Synchronous vs. Asynchronous Crawling

2017-06-25  疯狂的向日葵

1. The synchronous version with requests

1.1 Code:

# -*- coding: utf-8 -*-

'''
dengta news list (synchronous version, requests + BeautifulSoup)
'''

__author__ = 'Jimmy'

import requests
from bs4 import BeautifulSoup
import re
import time


class News:
    def __init__(self, title='', abstract='', detailUrl='', impact='', source='', content=''):
        self.title = title
        self.abstract = abstract
        self.detailUrl = detailUrl
        self.impact = impact
        self.source = source
        self.content = content

    def printNews(self):
        print('Title: %s\nSource: %s\nAbstract: %s\nImpact: %s\nURL: %s\nContent: %s'
              % (self.title, self.source, self.abstract, self.impact, self.detailUrl, self.content))


class Page:
    def __init__(self, newsCount=0, pageCount=0):
        self.newsCount = newsCount
        self.pageCount = pageCount


def getNewsPageCount(code):
    # Fetch the first news page and read the total news/page counts from the pager
    url = 'http://www.wedengta.com/stockDetail/0101%s/news/1.html' % code
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    pager = soup.find_all('div', class_='pager')[0]
    newsCount = pager.span.string
    ncount = re.sub(r'\D', '', newsCount)  # keep only the digits
    page = Page()
    page.newsCount = int(ncount)
    for c in pager.children:
        if c.string == '末页':  # the "last page" link carries the page count in its href
            url = c['href']
            pageCount = url.split('/')[-1].split('.')[0]
            page.pageCount = int(pageCount)
            return page

def getSingleNewsList(code, page):
    # Fetch one list page and follow every article link on it
    url = 'http://www.wedengta.com/stockDetail/0101%s/news/%d.html' % (code, page)
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    nl = soup.find_all('ul', class_='news_list')[0]
    newsList = []
    for li in nl.children:
        for a in li:
            news = News()
            news.detailUrl = 'http://www.wedengta.com%s' % a['href']
            sc = getSingleNewsDetail(news.detailUrl)
            if sc:
                news.title = a.h3.string
                news.abstract = a.p.string
                news.impact = a.span.string
                news.source = sc[0]
                news.content = sc[1]
                newsList.append(news)
                news.printNews()
    return newsList

def getSingleNewsDetail(url):
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    source = soup.find_all('div', class_='news_info')[0]
    content = soup.find_all('div', id='newsContent')[0]
    # skip articles whose content holds a nested <div> (non-plain-text body)
    if content.div is None:
        return [source.string, str(content)]
    else:
        return None

def getAllNewsList(code):
    newsList = []
    print('Fetching the news count for %s' % code)
    page = getNewsPageCount(code)
    print('%d pages, %d articles in total' % (page.pageCount, page.newsCount))
    if page.newsCount > 0:
        # range must run to pageCount + 1, or the last page is skipped
        for pageNo in range(1, page.pageCount + 1):
            aNewsList = getSingleNewsList(code, pageNo)
            newsList.extend(aNewsList)
    return newsList



start = time.time()
allNews = getAllNewsList('600585')  # avoid shadowing the built-in `list`
print('Elapsed: %s seconds' % (time.time() - start))
print(len(allNews))

# getSingleNewsDetail('http://www.wedengta.com/news/newsDetail/1/1498212464_869774_10_1.html')
# getSingleNewsDetail('http://www.wedengta.com/news/newsDetail/1/1498213693_9569133_9_1.html')

1.2 Result:

(Screenshot of the crawler's console output.)
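Part of the synchronous version's cost comes from opening a fresh TCP connection on every requests.get() call. As an aside (not part of the original scripts), a minimal sketch of reusing a single requests.Session so connections are pooled:

import requests

session = requests.Session()  # one Session pools and reuses TCP connections

def fetch(url):
    # every call through the same Session can reuse an already-open connection
    r = session.get(url)
    r.raise_for_status()
    return r.text

This helps, but each request still blocks while waiting for the response, which is the bottleneck the next section removes.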

2. The synchronous version is far too slow, so here is the aiohttp asynchronous version
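For readers new to aiohttp, the core pattern the script below builds on is a coroutine that fetches a page inside a ClientSession; the event loop runs many such coroutines concurrently, so slow responses overlap instead of queuing. A minimal sketch, with a hypothetical fetch helper and a placeholder URL:

import asyncio
import aiohttp

async def fetch(session, url):
    # awaiting yields control to the event loop while the response is in flight
    async with session.get(url) as r:
        return await r.text()

async def main(urls):
    async with aiohttp.ClientSession() as session:
        # gather schedules all fetches at once and collects results in order
        return await asyncio.gather(*(fetch(session, u) for u in urls))

# bodies = asyncio.get_event_loop().run_until_complete(main(['http://example.com']))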

2.1 Code:

# -*- coding: utf-8 -*-

'''
dengta news list (asynchronous version, aiohttp + BeautifulSoup)
'''

__author__ = 'Jimmy'

import aiohttp
import asyncio
import requests
from bs4 import BeautifulSoup
import re
import time

class News:
    def __init__(self, title='', abstract='', detailUrl='', impact='', source='', content=''):
        self.title = title
        self.abstract = abstract
        self.detailUrl = detailUrl
        self.impact = impact
        self.source = source
        self.content = content

    def printNews(self):
        print('Title: %s\nSource: %s\nAbstract: %s\nImpact: %s\nURL: %s\nContent: %s'
              % (self.title, self.source, self.abstract, self.impact, self.detailUrl, self.content))


class Page:
    def __init__(self, newsCount=0, pageCount=0):
        self.newsCount = newsCount
        self.pageCount = pageCount

def getNewsPageCount(code):
    # Still synchronous: this single upfront request is not worth making async
    url = 'http://www.wedengta.com/stockDetail/0101%s/news/1.html' % code
    r = requests.get(url)
    soup = BeautifulSoup(r.text, 'html.parser')
    pager = soup.find_all('div', class_='pager')[0]
    newsCount = pager.span.string
    ncount = re.sub(r'\D', '', newsCount)  # keep only the digits
    page = Page()
    page.newsCount = int(ncount)
    for c in pager.children:
        if c.string == '末页':  # the "last page" link carries the page count in its href
            url = c['href']
            pageCount = url.split('/')[-1].split('.')[0]
            page.pageCount = int(pageCount)
            return page

async def getSingleNewsList(code, page, newsList):
    # Fetch one list page, then await the detail page of each article on it
    url = 'http://www.wedengta.com/stockDetail/0101%s/news/%d.html' % (code, page)
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as r:
            body = await r.text(encoding='utf-8')
            soup = BeautifulSoup(body, 'html.parser')
            nl = soup.find_all('ul', class_='news_list')[0]
            for li in nl.children:
                for a in li:
                    news = News()
                    news.detailUrl = 'http://www.wedengta.com%s' % a['href']
                    sc = await getSingleNewsDetail(news.detailUrl)
                    if sc:
                        news.title = a.h3.string
                        news.abstract = a.p.string
                        news.impact = a.span.string
                        news.source = sc[0]
                        news.content = sc[1]
                        newsList.append(news)  # safe: all coroutines run in one thread
                        news.printNews()

async def getSingleNewsDetail(url):
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as r:
            body = await r.text(encoding='utf-8')
            soup = BeautifulSoup(body, 'html.parser')
            source = soup.find_all('div', class_='news_info')[0]
            content = soup.find_all('div', id='newsContent')[0]
            # skip articles whose content holds a nested <div> (non-plain-text body)
            if content.div is None:
                return [source.string, str(content)]
            else:
                return None

def getAllNewsList(code):
    newsList = []
    print('Fetching the news count for %s' % code)
    page = getNewsPageCount(code)
    print('%d pages, %d articles in total' % (page.pageCount, page.newsCount))
    if page.newsCount > 0:
        loop = asyncio.get_event_loop()
        # one coroutine per list page; asyncio.wait wraps them in Tasks
        # and runs them all concurrently on the event loop
        tasks = [getSingleNewsList(code, pc, newsList) for pc in range(1, page.pageCount + 1)]
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
    return newsList


start = time.time()
allNews = getAllNewsList('600585')  # avoid shadowing the built-in `list`
print('Elapsed: %s seconds' % (time.time() - start))
print(len(allNews))
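One caveat with the asynchronous version as written: it opens a new ClientSession per request and fires a coroutine for every page (and every article on it) at once, which can overwhelm the target server. A common remedy, sketched here as a suggestion rather than something from the original post, is to cap concurrency with an asyncio.Semaphore:

import asyncio

sem = asyncio.Semaphore(10)  # allow at most 10 coroutines past this point at once

async def bounded(coro):
    # wait for a free slot, then run the wrapped coroutine
    async with sem:
        return await coro

# usage in getAllNewsList:
# tasks = [bounded(getSingleNewsList(code, pc, newsList)) for pc in range(1, page.pageCount + 1)]

Sharing one ClientSession across all coroutines (passed in as an argument, as in the primer sketch above) would also reuse connections instead of rebuilding one per request.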