爬虫:爬取熊猫TV主播及其人气值

2017-12-22  本文已影响79人  _Kantin

平台:VS Code
语言:Python 3
代码链接:https://pan.baidu.com/s/1boIMDaB


from urllib import request
import re


class Spider():
    """Scrape the Panda TV LoL category page and rank streamers by popularity.

    The pipeline run by ``go()`` is: download the page, extract every
    ``video-info`` card with regular expressions, clean the extracted
    fields, sort by viewer count (largest first) and print the ranking.
    """

    # Target URL to scrape.
    url = 'https://www.panda.tv/cate/lol'
    # Regex notes: `*?` is a non-greedy quantifier (match as little HTML as
    # possible), `()` captures the matched text, `[]` defines a character
    # set and `{}` specifies a repetition count.
    # Raw strings keep `\s` / `\S` from being treated as (invalid) string
    # escape sequences.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    name_pattern = r'></i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'

    def __fetch_content(self):
        """Download ``Spider.url`` and return its body decoded as UTF-8 text."""
        # The context manager closes the HTTP response even if read() fails.
        with request.urlopen(Spider.url) as response:
            raw = response.read()
        # The response body is bytes; convert it to str.
        return raw.decode('utf-8')

    def __analysis(self, htmls):
        """Extract the raw name/number matches from every streamer card.

        Args:
            htmls: The page HTML as a string.

        Returns:
            A list of dicts ``{'name': [...], 'number': [...]}`` where each
            value is the list of regex matches found inside one card (empty
            when the card does not contain that field).
        """
        anchors = []
        # re.findall returns a list with one captured card body per match.
        for card_html in re.findall(Spider.root_pattern, htmls):
            anchors.append({
                'name': re.findall(Spider.name_pattern, card_html),
                'number': re.findall(Spider.number_pattern, card_html),
            })
        return anchors

    def __refine(self, anchors):
        """Clean the scraped matches into flat ``{'name', 'number'}`` records.

        Takes the first match of each field and strips whitespace around the
        name. Cards in which either field was not found are skipped instead
        of raising ``IndexError``.

        Returns:
            A list of dicts with string values for 'name' and 'number'.
        """
        return [
            {
                'name': anchor['name'][0].strip(),
                'number': anchor['number'][0],
            }
            for anchor in anchors
            if anchor['name'] and anchor['number']
        ]

    def __sort_seed(self, anchor):
        """Return the numeric sort key for one cleaned anchor record.

        Parses the first number in ``anchor['number']``, including an
        optional decimal part, and multiplies it by 10000 when the text
        contains the unit '万' (ten thousand), so '1.5万' yields 15000.0.
        Returns 0.0 when the text contains no digits.
        """
        match = re.search(r'\d+(?:\.\d+)?', anchor['number'])
        if match is None:
            return 0.0
        number = float(match.group())
        # Convert counts written with '万' into the plain number.
        if '万' in anchor['number']:
            number *= 10000
        return number

    def __sort(self, anchors):
        """Return the anchors sorted by viewer count, largest first."""
        return sorted(anchors, key=self.__sort_seed, reverse=True)

    def __show(self, anchors):
        """Print one ``rank: N name: X number: Y`` line per anchor."""
        for rank, anchor in enumerate(anchors, start=1):
            print("rank: {} name: {} number: {}".format(
                rank, anchor['name'], anchor['number']))

    def go(self):
        """Run the whole pipeline: fetch, parse, clean, sort and print."""
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__refine(anchors)
        anchors = self.__sort(anchors)
        self.__show(anchors)

# Class variables behave like Java static fields, so they are accessed through the class name.
# Private (double-underscore) methods are meant to be called only from within the class itself.
# Create a Spider instance and run its go() entry point.
spider = Spider()
spider.go()
   
image.png
上一篇 下一篇

猜你喜欢

热点阅读