Python native web crawler mini-demo
from urllib import request
import re
'''
The BeautifulSoup or Scrapy frameworks could be used instead. Related topics:
crawling, anti-crawling, counter-anti-crawling, IP bans, proxy IPs.
Pipeline:
    fetch the content
    extract the content
    refine the content
    sort the content
    output, store to a database, etc.
(A BeautifulSoup sketch of the same pipeline appears after the listing.)
'''
class Spider():
    url = "https://www.panda.tv/cate/lol"
    # Regex notes: [\s\S] is a character class matching any character
    # (including newlines); * repeats it any number of times; ? on its own
    # matches 0 or 1 times, but placed after * it makes the match non-greedy
    # (match as little as possible); () captures only the part inside.
    root_pattern = r'<div class="video-info">([\s\S]*?)</div>'
    # The two patterns below assume panda.tv's markup at the time of writing:
    # the streamer name sits after an </i> inside a <span>, the viewer count
    # inside a <span class="video-number">.
    name_pattern = r'</i>([\s\S]*?)</span>'
    number_pattern = r'<span class="video-number">([\s\S]*?)</span>'
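    # Greedy vs. non-greedy, for example (illustrative string):
    #   re.findall(r'<b>(.*)</b>', '<b>x</b><b>y</b>')   -> ['x</b><b>y']
    #   re.findall(r'<b>(.*?)</b>', '<b>x</b><b>y</b>')  -> ['x', 'y']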
    def __fetch_content(self):  # fetch the page
        r = request.urlopen(Spider.url)
        htmls = r.read()  # raw bytes
        htmls = str(htmls, encoding='utf-8')  # decode as UTF-8
        return htmls
    def __analysis(self, htmls):  # extract the content
        root_html = re.findall(Spider.root_pattern, htmls)
        # print(root_html[0])
        anchors = []
        for html in root_html:
            name = re.findall(Spider.name_pattern, html)
            number = re.findall(Spider.number_pattern, html)
            anchor = {'name': name, 'number': number}
            anchors.append(anchor)
        return anchors
    def __refine(self, anchors):  # refine: strip whitespace, newlines, etc.
        l = lambda anchor: {  # lambda expression mapping one raw dict to a clean one
            'name': anchor['name'][0].strip(),
            'number': anchor['number'][0]
        }
        return list(map(l, anchors))
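    # __refine example (illustrative values):
    #   {'name': ['\n  Uzi  '], 'number': ['1.5万']} -> {'name': 'Uzi', 'number': '1.5万'}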
    def __sort(self, anchors):  # sort by viewer count
        anchors = sorted(anchors, key=self.__sort_seed, reverse=True)  # reverse=True for descending order
        return anchors
    def __sort_seed(self, anchor):  # sort key; counts quoted in 万 are multiplied by 10 000
        r = re.findall(r'\d+\.?\d*', anchor['number'])  # leading number, decimals included
        number = float(r[0])
        if '万' in anchor['number']:
            number *= 10000
        return number
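    # __sort_seed examples (illustrative values):
    #   {'number': '1.5万'} -> 15000.0    {'number': '870'} -> 870.0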
    def __show(self, anchors):  # print the ranking
        for rank in range(0, len(anchors)):
            print('rank:' + str(rank + 1) + ';' +
                  'name:' + anchors[rank]['name'] + ';' +
                  'number:' + anchors[rank]['number'] + ';')
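    # output lines look like: rank:1;name:Uzi;number:1.5万;  (illustrative values)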
    def go(self):  # entry point: run the whole pipeline
        htmls = self.__fetch_content()
        anchors = self.__analysis(htmls)
        anchors = self.__refine(anchors)
        anchors = self.__sort(anchors)
        self.__show(anchors)
spider = Spider()
spider.go()
Crawl results:
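As noted at the top, BeautifulSoup can do the same job with less regex. Below is a minimal sketch of the fetch-and-extract steps, assuming the requests and bs4 packages are installed and the same panda.tv markup implied by the regex patterns above (div.video-info, span.video-number); the video-nickname class for the streamer name is a guess and may not match the live page.

import requests
from bs4 import BeautifulSoup

def fetch_with_bs4(url='https://www.panda.tv/cate/lol'):
    # Fetch and parse the category page; the CSS classes mirror the regex
    # patterns above and are assumptions about panda.tv's markup.
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html.parser')
    anchors = []
    for info in soup.find_all('div', class_='video-info'):
        number = info.find('span', class_='video-number')
        name = info.find('span', class_='video-nickname')  # assumed class name
        if name and number:
            anchors.append({'name': name.get_text(strip=True),
                            'number': number.get_text(strip=True)})
    return anchors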